From 2a6832e697b6d9660ff853e6dc893332b6376cf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20G=C3=B3rski?= Date: Thu, 2 Nov 2023 17:53:09 +0100 Subject: [PATCH] 6.6: EEVDF adaptation to BORE 3.2.9 (#832) Signed-off-by: Piotr Gorski --- linux-tkg-config/prepare | 4 - linux-tkg-patches/6.6/0003-eevdf.patch | 1394 ----------------- .../0004-eevdf-Disable-DELAY_DEQUEUE.patch | 26 - 3 files changed, 1424 deletions(-) delete mode 100644 linux-tkg-patches/6.6/0003-eevdf.patch delete mode 100644 linux-tkg-patches/6.6/0004-eevdf-Disable-DELAY_DEQUEUE.patch diff --git a/linux-tkg-config/prepare b/linux-tkg-config/prepare index dc47e19..ecb3b24 100644 --- a/linux-tkg-config/prepare +++ b/linux-tkg-config/prepare @@ -847,10 +847,6 @@ _tkg_srcprep() { tkgpatch="$srcdir/tt_high_hz.patch" && _tkg_patcher fi elif [ "${_cpusched}" = "bore" ]; then - if [[ $_kver = 606 ]]; then - _msg="Applying eevdf \"additions\" patch" - tkgpatch="$srcdir/0003-eevdf.patch" && _tkg_patcher - fi _msg="Applying BORE patch" curl "https://raw.githubusercontent.com/CachyOS/kernel-patches/master/${_basekernel}/sched/0001-bore.patch" > "$srcdir"/0001-bore.patch tkgpatch="$srcdir/0001-bore.patch" && _tkg_patcher diff --git a/linux-tkg-patches/6.6/0003-eevdf.patch b/linux-tkg-patches/6.6/0003-eevdf.patch deleted file mode 100644 index f48058c..0000000 --- a/linux-tkg-patches/6.6/0003-eevdf.patch +++ /dev/null @@ -1,1394 +0,0 @@ -From b4e32953bb6b22b413ac442ec155e8a7660f90a3 Mon Sep 17 00:00:00 2001 -From: Ingo Molnar -Date: Tue, 19 Sep 2023 10:31:15 +0200 -Subject: [PATCH 01/11] sched/fair: Rename check_preempt_wakeup() to - check_preempt_wakeup_fair() - -Other scheduling classes already postfix their similar methods -with the class name. 
- -Signed-off-by: Ingo Molnar -Acked-by: Peter Zijlstra (Intel) ---- - kernel/sched/fair.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index df348aa55..ab1ad125c 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -8052,7 +8052,7 @@ static void set_next_buddy(struct sched_entity *se) - /* - * Preempt the current task with a newly woken task if needed: - */ --static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) - { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; -@@ -12908,7 +12908,7 @@ DEFINE_SCHED_CLASS(fair) = { - .yield_task = yield_task_fair, - .yield_to_task = yield_to_task_fair, - -- .check_preempt_curr = check_preempt_wakeup, -+ .check_preempt_curr = check_preempt_wakeup_fair, - - .pick_next_task = __pick_next_task_fair, - .put_prev_task = put_prev_task_fair, --- -2.42.0 - - -From f5ca136342a997ec670cc0276b23cd3dc892d708 Mon Sep 17 00:00:00 2001 -From: Ingo Molnar -Date: Tue, 19 Sep 2023 10:38:21 +0200 -Subject: [PATCH 02/11] sched/fair: Rename check_preempt_curr() to - wakeup_preempt() - -The name is a bit opaque - make it clear that this is about wakeup -preemption. - -Also rename the ->check_preempt_curr() methods similarly. 
- -Signed-off-by: Ingo Molnar -Acked-by: Peter Zijlstra (Intel) ---- - kernel/sched/core.c | 14 +++++++------- - kernel/sched/deadline.c | 10 +++++----- - kernel/sched/fair.c | 10 +++++----- - kernel/sched/idle.c | 4 ++-- - kernel/sched/rt.c | 6 +++--- - kernel/sched/sched.h | 4 ++-- - kernel/sched/stop_task.c | 4 ++-- - 7 files changed, 26 insertions(+), 26 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 802551e00..28768a3b2 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -2218,10 +2218,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, - p->sched_class->prio_changed(rq, p, oldprio); - } - --void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) -+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) - { - if (p->sched_class == rq->curr->sched_class) -- rq->curr->sched_class->check_preempt_curr(rq, p, flags); -+ rq->curr->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, rq->curr->sched_class)) - resched_curr(rq); - -@@ -2527,7 +2527,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, - rq_lock(rq, rf); - WARN_ON_ONCE(task_cpu(p) != new_cpu); - activate_task(rq, p, 0); -- check_preempt_curr(rq, p, 0); -+ wakeup_preempt(rq, p, 0); - - return rq; - } -@@ -3409,7 +3409,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) - deactivate_task(src_rq, p, 0); - set_task_cpu(p, cpu); - activate_task(dst_rq, p, 0); -- check_preempt_curr(dst_rq, p, 0); -+ wakeup_preempt(dst_rq, p, 0); - - rq_unpin_lock(dst_rq, &drf); - rq_unpin_lock(src_rq, &srf); -@@ -3785,7 +3785,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - } - - activate_task(rq, p, en_flags); -- check_preempt_curr(rq, p, wake_flags); -+ wakeup_preempt(rq, p, wake_flags); - - ttwu_do_wakeup(p); - -@@ -3856,7 +3856,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) - * it should preempt 
the task that is current now. - */ - update_rq_clock(rq); -- check_preempt_curr(rq, p, wake_flags); -+ wakeup_preempt(rq, p, wake_flags); - } - ttwu_do_wakeup(p); - ret = 1; -@@ -4871,7 +4871,7 @@ void wake_up_new_task(struct task_struct *p) - - activate_task(rq, p, ENQUEUE_NOCLOCK); - trace_sched_wakeup_new(p); -- check_preempt_curr(rq, p, WF_FORK); -+ wakeup_preempt(rq, p, WF_FORK); - #ifdef CONFIG_SMP - if (p->sched_class->task_woken) { - /* -diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c -index 58b542bf2..fb1996a67 100644 ---- a/kernel/sched/deadline.c -+++ b/kernel/sched/deadline.c -@@ -763,7 +763,7 @@ static inline void deadline_queue_pull_task(struct rq *rq) - - static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); - static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); --static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); -+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); - - static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, - struct rq *rq) -@@ -1175,7 +1175,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) - - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) -- check_preempt_curr_dl(rq, p, 0); -+ wakeup_preempt_dl(rq, p, 0); - else - resched_curr(rq); - -@@ -1939,7 +1939,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - * Only called when both the current and waking task are -deadline - * tasks. 
- */ --static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, -+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, - int flags) - { - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { -@@ -2652,7 +2652,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) - deadline_queue_push_tasks(rq); - #endif - if (dl_task(rq->curr)) -- check_preempt_curr_dl(rq, p, 0); -+ wakeup_preempt_dl(rq, p, 0); - else - resched_curr(rq); - } else { -@@ -2721,7 +2721,7 @@ DEFINE_SCHED_CLASS(dl) = { - .dequeue_task = dequeue_task_dl, - .yield_task = yield_task_dl, - -- .check_preempt_curr = check_preempt_curr_dl, -+ .wakeup_preempt = wakeup_preempt_dl, - - .pick_next_task = pick_next_task_dl, - .put_prev_task = put_prev_task_dl, -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index ab1ad125c..785e8611e 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -8065,7 +8065,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int - - /* - * This is possible from callers such as attach_tasks(), in which we -- * unconditionally check_preempt_curr() after an enqueue (which may have -+ * unconditionally wakeup_preempt() after an enqueue (which may have - * lead to a throttle). This both saves work and prevents false - * next-buddy nomination below. 
- */ -@@ -8972,7 +8972,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) - - WARN_ON_ONCE(task_rq(p) != rq); - activate_task(rq, p, ENQUEUE_NOCLOCK); -- check_preempt_curr(rq, p, 0); -+ wakeup_preempt(rq, p, 0); - } - - /* -@@ -12447,7 +12447,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) - if (p->prio > oldprio) - resched_curr(rq); - } else -- check_preempt_curr(rq, p, 0); -+ wakeup_preempt(rq, p, 0); - } - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -12549,7 +12549,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) - if (task_current(rq, p)) - resched_curr(rq); - else -- check_preempt_curr(rq, p, 0); -+ wakeup_preempt(rq, p, 0); - } - } - -@@ -12908,7 +12908,7 @@ DEFINE_SCHED_CLASS(fair) = { - .yield_task = yield_task_fair, - .yield_to_task = yield_to_task_fair, - -- .check_preempt_curr = check_preempt_wakeup_fair, -+ .wakeup_preempt = check_preempt_wakeup_fair, - - .pick_next_task = __pick_next_task_fair, - .put_prev_task = put_prev_task_fair, -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index 5007b25c5..565f8374d 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -401,7 +401,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - /* - * Idle tasks are unconditionally rescheduled: - */ --static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) -+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) - { - resched_curr(rq); - } -@@ -482,7 +482,7 @@ DEFINE_SCHED_CLASS(idle) = { - /* dequeue is not valid, we print a debug message there: */ - .dequeue_task = dequeue_task_idle, - -- .check_preempt_curr = check_preempt_curr_idle, -+ .wakeup_preempt = wakeup_preempt_idle, - - .pick_next_task = pick_next_task_idle, - .put_prev_task = put_prev_task_idle, -diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c -index 0597ba0f8..3e442fa3f 100644 ---- a/kernel/sched/rt.c -+++ b/kernel/sched/rt.c -@@ -953,7 +953,7 @@ 
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) - - /* - * When we're idle and a woken (rt) task is -- * throttled check_preempt_curr() will set -+ * throttled wakeup_preempt() will set - * skip_update and the time between the wakeup - * and this unthrottle will get accounted as - * 'runtime'. -@@ -1715,7 +1715,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - /* - * Preempt the current task with a newly woken task if needed: - */ --static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) -+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) - { - if (p->prio < rq->curr->prio) { - resched_curr(rq); -@@ -2702,7 +2702,7 @@ DEFINE_SCHED_CLASS(rt) = { - .dequeue_task = dequeue_task_rt, - .yield_task = yield_task_rt, - -- .check_preempt_curr = check_preempt_curr_rt, -+ .wakeup_preempt = wakeup_preempt_rt, - - .pick_next_task = pick_next_task_rt, - .put_prev_task = put_prev_task_rt, -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 048462724..c3ca10d02 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2239,7 +2239,7 @@ struct sched_class { - void (*yield_task) (struct rq *rq); - bool (*yield_to_task)(struct rq *rq, struct task_struct *p); - -- void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); -+ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); - - struct task_struct *(*pick_next_task)(struct rq *rq); - -@@ -2513,7 +2513,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); - extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); - --extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); -+extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); - - #ifdef CONFIG_PREEMPT_RT - #define SCHED_NR_MIGRATE_BREAK 8 
-diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c -index 85590599b..6cf7304e6 100644 ---- a/kernel/sched/stop_task.c -+++ b/kernel/sched/stop_task.c -@@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - #endif /* CONFIG_SMP */ - - static void --check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) -+wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) - { - /* we're never preempted */ - } -@@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = { - .dequeue_task = dequeue_task_stop, - .yield_task = yield_task_stop, - -- .check_preempt_curr = check_preempt_curr_stop, -+ .wakeup_preempt = wakeup_preempt_stop, - - .pick_next_task = pick_next_task_stop, - .put_prev_task = put_prev_task_stop, --- -2.42.0 - - -From 685dbae13ab75814a3706a3741db931d7d56bb8e Mon Sep 17 00:00:00 2001 -From: Sebastian Andrzej Siewior -Date: Wed, 20 Sep 2023 15:00:24 +0200 -Subject: [PATCH 03/11] sched/debug: Remove the - /proc/sys/kernel/sched_child_runs_first sysctl - -The /proc/sys/kernel/sched_child_runs_first knob is no longer connected since: - - 5e963f2bd4654 ("sched/fair: Commit to EEVDF") - -Remove it. 
- -Signed-off-by: Sebastian Andrzej Siewior -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230920130025.412071-2-bigeasy@linutronix.de ---- - kernel/sched/debug.c | 1 - - kernel/sched/fair.c | 13 ------------- - kernel/sched/sched.h | 2 -- - 3 files changed, 16 deletions(-) - -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 4c3d0d9f3..132dfd1e6 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -864,7 +864,6 @@ static void sched_debug_header(struct seq_file *m) - #define PN(x) \ - SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - PN(sysctl_sched_base_slice); -- P(sysctl_sched_child_runs_first); - P(sysctl_sched_features); - #undef PN - #undef P -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 785e8611e..829817ef6 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -78,12 +78,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; - unsigned int sysctl_sched_base_slice = 750000ULL; - static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; - --/* -- * After fork, child runs first. If set to 0 (default) then -- * parent will (try to) run first. 
-- */ --unsigned int sysctl_sched_child_runs_first __read_mostly; -- - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; - - int sched_thermal_decay_shift; -@@ -145,13 +139,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; - - #ifdef CONFIG_SYSCTL - static struct ctl_table sched_fair_sysctls[] = { -- { -- .procname = "sched_child_runs_first", -- .data = &sysctl_sched_child_runs_first, -- .maxlen = sizeof(unsigned int), -- .mode = 0644, -- .proc_handler = proc_dointvec, -- }, - #ifdef CONFIG_CFS_BANDWIDTH - { - .procname = "sched_cfs_bandwidth_slice_us", -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index c3ca10d02..7fc32a87f 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -109,8 +109,6 @@ extern __read_mostly int scheduler_running; - extern unsigned long calc_load_update; - extern atomic_long_t calc_load_tasks; - --extern unsigned int sysctl_sched_child_runs_first; -- - extern void calc_global_load_tick(struct rq *this_rq); - extern long calc_load_fold_active(struct rq *this_rq, long adjust); - --- -2.42.0 - - -From 7ad030f0be19209fd944ace9be02bcc0d7b4962a Mon Sep 17 00:00:00 2001 -From: Yiwei Lin -Date: Fri, 20 Oct 2023 13:56:17 +0800 -Subject: [PATCH 04/11] sched/fair: Remove unused 'curr' argument from - pick_next_entity() - -The 'curr' argument of pick_next_entity() has become unused after -the EEVDF changes. - -[ mingo: Updated the changelog. 
] - -Signed-off-by: Yiwei Lin -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20231020055617.42064-1-s921975628@gmail.com ---- - kernel/sched/fair.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 829817ef6..a30c03cc2 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -5254,7 +5254,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - * 4) do not run the "skip" process, if something else is available - */ - static struct sched_entity * --pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) -+pick_next_entity(struct cfs_rq *cfs_rq) - { - /* - * Enabling NEXT_BUDDY will affect latency but not fairness. -@@ -8144,7 +8144,7 @@ static struct task_struct *pick_task_fair(struct rq *rq) - goto again; - } - -- se = pick_next_entity(cfs_rq, curr); -+ se = pick_next_entity(cfs_rq); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - -@@ -8207,7 +8207,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf - } - } - -- se = pick_next_entity(cfs_rq, curr); -+ se = pick_next_entity(cfs_rq); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - -@@ -8246,7 +8246,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf - put_prev_task(rq, prev); - - do { -- se = pick_next_entity(cfs_rq, NULL); -+ se = pick_next_entity(cfs_rq); - set_next_entity(cfs_rq, se); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); --- -2.42.0 - - -From 3745fd9933ee1ca1d4dc4dde0a39c2da275c110f Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Sat, 14 Oct 2023 23:12:20 +0200 -Subject: [PATCH 05/11] sched/eevdf: Add feature comments - -Signed-off-by: Peter Zijlstra (Intel) ---- - kernel/sched/features.h | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index f77016823..ef91684d8 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ 
-5,7 +5,14 @@ - * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. - */ - SCHED_FEAT(PLACE_LAG, true) -+/* -+ * Give new tasks half a slice to ease into the competition. -+ */ - SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) -+/* -+ * Inhibit (wakeup) preemption until the current task has either matched the -+ * 0-lag point or until is has exhausted it's slice. -+ */ - SCHED_FEAT(RUN_TO_PARITY, true) - - /* --- -2.42.0 - - -From 1346e28f7dfb59207364f366e4235ee7304150e8 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 4 Oct 2023 12:43:53 +0200 -Subject: [PATCH 06/11] sched/eevdf: Remove min_vruntime_copy - -Since commit e8f331bcc270 ("sched/smp: Use lag to simplify -cross-runqueue placement") the min_vruntime_copy is no longer used. - -Signed-off-by: Peter Zijlstra (Intel) ---- - kernel/sched/fair.c | 5 ++--- - kernel/sched/sched.h | 4 ---- - 2 files changed, 2 insertions(+), 7 deletions(-) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index a30c03cc2..c118381f0 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -774,8 +774,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) - } - - /* ensure we never gain time by being placed backwards. 
*/ -- u64_u32_store(cfs_rq->min_vruntime, -- __update_min_vruntime(cfs_rq, vruntime)); -+ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); - } - - static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) -@@ -12571,7 +12570,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) - void init_cfs_rq(struct cfs_rq *cfs_rq) - { - cfs_rq->tasks_timeline = RB_ROOT_CACHED; -- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); -+ cfs_rq->min_vruntime = (u64)(-(1LL << 20)); - #ifdef CONFIG_SMP - raw_spin_lock_init(&cfs_rq->removed.lock); - #endif -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 7fc32a87f..f447bfbb1 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -558,10 +558,6 @@ struct cfs_rq { - u64 min_vruntime_fi; - #endif - --#ifndef CONFIG_64BIT -- u64 min_vruntime_copy; --#endif -- - struct rb_root_cached tasks_timeline; - - /* --- -2.42.0 - - -From a707549ff96d3887c6cd6f4adbed195f8f01ab34 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Mon, 22 May 2023 13:46:30 +0200 -Subject: [PATCH 07/11] sched/eevdf: Use sched_attr::sched_runtime to set - request/slice suggestion - -Allow applications to directly set a suggested request/slice length using -sched_attr::sched_runtime. - -The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms] -which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100. - -Applications should strive to use their periodic runtime at a high -confidence interval (95%+) as the target slice. Using a smaller slice -will introduce undue preemptions, while using a larger value will -increase latency. - -For all the following examples assume a scheduling quantum of 8, and for -consistency all examples have W=4: - - {A,B,C,D}(w=1,r=8): - - ABCD... 
- +---+---+---+--- - - t=0, V=1.5 t=1, V=3.5 - A |------< A |------< - B |------< B |------< - C |------< C |------< - D |------< D |------< - ---+*------+-------+--- ---+--*----+-------+--- - - t=2, V=5.5 t=3, V=7.5 - A |------< A |------< - B |------< B |------< - C |------< C |------< - D |------< D |------< - ---+----*--+-------+--- ---+------*+-------+--- - -Note: 4 identical tasks in FIFO order - -~~~ - - {A,B}(w=1,r=16) C(w=2,r=16) - - AACCBBCC... - +---+---+---+--- - - t=0, V=1.25 t=2, V=5.25 - A |--------------< A |--------------< - B |--------------< B |--------------< - C |------< C |------< - ---+*------+-------+--- ---+----*--+-------+--- - - t=4, V=8.25 t=6, V=12.25 - A |--------------< A |--------------< - B |--------------< B |--------------< - C |------< C |------< - ---+-------*-------+--- ---+-------+---*---+--- - -Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2 - task doesn't go below q. - -Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length. - -Note: the period of the heavy task is half the full period at: - W*(r_i/w_i) = 4*(2q/2) = 4q - -~~~ - - {A,C,D}(w=1,r=16) B(w=1,r=8): - - BAACCBDD... - +---+---+---+--- - - t=0, V=1.5 t=1, V=3.5 - A |--------------< A |---------------< - B |------< B |------< - C |--------------< C |--------------< - D |--------------< D |--------------< - ---+*------+-------+--- ---+--*----+-------+--- - - t=3, V=7.5 t=5, V=11.5 - A |---------------< A |---------------< - B |------< B |------< - C |--------------< C |--------------< - D |--------------< D |--------------< - ---+------*+-------+--- ---+-------+--*----+--- - - t=6, V=13.5 - A |---------------< - B |------< - C |--------------< - D |--------------< - ---+-------+----*--+--- - -Note: 1 short task -- again double r so that the deadline of the short task - won't be below q. Made B short because its not the leftmost task, but is - eligible with the 0,1,2,3 spread. 
- -Note: like with the heavy task, the period of the short task observes: - W*(r_i/w_i) = 4*(1q/1) = 4q - -~~~ - - A(w=1,r=16) B(w=1,r=8) C(w=2,r=16) - - BCCAABCC... - +---+---+---+--- - - t=0, V=1.25 t=1, V=3.25 - A |--------------< A |--------------< - B |------< B |------< - C |------< C |------< - ---+*------+-------+--- ---+--*----+-------+--- - - t=3, V=7.25 t=5, V=11.25 - A |--------------< A |--------------< - B |------< B |------< - C |------< C |------< - ---+------*+-------+--- ---+-------+--*----+--- - - t=6, V=13.25 - A |--------------< - B |------< - C |------< - ---+-------+----*--+--- - -Note: 1 heavy and 1 short task -- combine them all. - -Note: both the short and heavy task end up with a period of 4q - -~~~ - - A(w=1,r=16) B(w=2,r=16) C(w=1,r=8) - - BBCAABBC... - +---+---+---+--- - - t=0, V=1 t=2, V=5 - A |--------------< A |--------------< - B |------< B |------< - C |------< C |------< - ---+*------+-------+--- ---+----*--+-------+--- - - t=3, V=7 t=5, V=11 - A |--------------< A |--------------< - B |------< B |------< - C |------< C |------< - ---+------*+-------+--- ---+-------+--*----+--- - - t=7, V=15 - A |--------------< - B |------< - C |------< - ---+-------+------*+--- - -Note: as before but permuted - -~~~ - -From all this it can be deduced that, for the steady state: - - - the total period (P) of a schedule is: W*max(r_i/w_i) - - the average period of a task is: W*(r_i/w_i) - - each task obtains the fair share: w_i/W of each full period P - -Signed-off-by: Peter Zijlstra (Intel) ---- - include/linux/sched.h | 3 +++ - kernel/sched/core.c | 33 ++++++++++++++++++++++++++------- - kernel/sched/debug.c | 3 ++- - kernel/sched/fair.c | 6 ++++-- - 4 files changed, 35 insertions(+), 10 deletions(-) - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 77f01ac38..e90e58b2c 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -555,6 +555,9 @@ struct sched_entity { - struct list_head group_node; - unsigned int 
on_rq; - -+ unsigned int custom_slice : 1; -+ /* 31 bits hole */ -+ - u64 exec_start; - u64 sum_exec_runtime; - u64 prev_sum_exec_runtime; -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 28768a3b2..d914ec370 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4501,7 +4501,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->se.nr_migrations = 0; - p->se.vruntime = 0; - p->se.vlag = 0; -- p->se.slice = sysctl_sched_base_slice; - INIT_LIST_HEAD(&p->se.group_node); - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -4755,6 +4754,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - - p->prio = p->normal_prio = p->static_prio; - set_load_weight(p, false); -+ p->se.custom_slice = 0; -+ p->se.slice = sysctl_sched_base_slice; - - /* - * We don't need the reset flag anymore after the fork. It has -@@ -7523,10 +7524,20 @@ static void __setscheduler_params(struct task_struct *p, - - p->policy = policy; - -- if (dl_policy(policy)) -+ if (dl_policy(policy)) { - __setparam_dl(p, attr); -- else if (fair_policy(policy)) -+ } else if (fair_policy(policy)) { - p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ if (attr->sched_runtime) { -+ p->se.custom_slice = 1; -+ p->se.slice = clamp_t(u64, attr->sched_runtime, -+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ -+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */ -+ } else { -+ p->se.custom_slice = 0; -+ p->se.slice = sysctl_sched_base_slice; -+ } -+ } - - /* - * __sched_setscheduler() ensures attr->sched_priority == 0 when -@@ -7711,7 +7722,9 @@ static int __sched_setscheduler(struct task_struct *p, - * but store a possible modification of reset_on_fork. 
- */ - if (unlikely(policy == p->policy)) { -- if (fair_policy(policy) && attr->sched_nice != task_nice(p)) -+ if (fair_policy(policy) && -+ (attr->sched_nice != task_nice(p) || -+ (attr->sched_runtime && attr->sched_runtime != p->se.slice))) - goto change; - if (rt_policy(policy) && attr->sched_priority != p->rt_priority) - goto change; -@@ -7857,6 +7870,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy, - .sched_nice = PRIO_TO_NICE(p->static_prio), - }; - -+ if (p->se.custom_slice) -+ attr.sched_runtime = p->se.slice; -+ - /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ - if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { - attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -@@ -8033,12 +8049,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a - - static void get_params(struct task_struct *p, struct sched_attr *attr) - { -- if (task_has_dl_policy(p)) -+ if (task_has_dl_policy(p)) { - __getparam_dl(p, attr); -- else if (task_has_rt_policy(p)) -+ } else if (task_has_rt_policy(p)) { - attr->sched_priority = p->rt_priority; -- else -+ } else { - attr->sched_nice = task_nice(p); -+ attr->sched_runtime = p->se.slice; -+ } - } - - /** -@@ -10057,6 +10075,7 @@ void __init sched_init(void) - } - - set_load_weight(&init_task, false); -+ init_task.se.slice = sysctl_sched_base_slice, - - /* - * The boot idle thread does lazy MMU switching as well: -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 132dfd1e6..762c899df 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -579,11 +579,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) - else - SEQ_printf(m, " %c", task_state_to_char(p)); - -- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", -+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", - p->comm, task_pid_nr(p), - SPLIT_NS(p->se.vruntime), - entity_eligible(cfs_rq_of(&p->se), 
&p->se) ? 'E' : 'N', - SPLIT_NS(p->se.deadline), -+ p->se.custom_slice ? 'S' : ' ', - SPLIT_NS(p->se.slice), - SPLIT_NS(p->se.sum_exec_runtime), - (long long)(p->nvcsw + p->nivcsw), -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index c118381f0..a2cb468a9 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -1012,7 +1012,8 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) - * nice) while the request time r_i is determined by - * sysctl_sched_base_slice. - */ -- se->slice = sysctl_sched_base_slice; -+ if (!se->custom_slice) -+ se->slice = sysctl_sched_base_slice; - - /* - * EEVDF: vd_i = ve_i + r_i / w_i -@@ -4962,7 +4963,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - u64 vslice, vruntime = avg_vruntime(cfs_rq); - s64 lag = 0; - -- se->slice = sysctl_sched_base_slice; -+ if (!se->custom_slice) -+ se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); - - /* --- -2.42.0 - - -From 27a6fd2cdb4818ab014d744d6b28a10e2ba3c7b0 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 26 Sep 2023 14:32:32 +0200 -Subject: [PATCH 08/11] sched/eevdf: Allow shorter slices to wakeup-preempt - -Part of the reason to have shorter slices is to improve -responsiveness. Allow shorter slices to preempt longer slices on -wakeup. 
- - Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms | - - 100ms massive_intr 500us cyclictest NO_PREEMPT_SHORT - - 1 massive_intr:(5) | 846018.956 ms | 779188 | avg: 0.273 ms | max: 58.337 ms | sum:212545.245 ms | - 2 massive_intr:(5) | 853450.693 ms | 792269 | avg: 0.275 ms | max: 71.193 ms | sum:218263.588 ms | - 3 massive_intr:(5) | 843888.920 ms | 771456 | avg: 0.277 ms | max: 92.405 ms | sum:213353.221 ms | - 1 chromium-browse:(8) | 53015.889 ms | 131766 | avg: 0.463 ms | max: 36.341 ms | sum:60959.230 ms | - 2 chromium-browse:(8) | 53864.088 ms | 136962 | avg: 0.480 ms | max: 27.091 ms | sum:65687.681 ms | - 3 chromium-browse:(9) | 53637.904 ms | 132637 | avg: 0.481 ms | max: 24.756 ms | sum:63781.673 ms | - 1 cyclictest:(5) | 12615.604 ms | 639689 | avg: 0.471 ms | max: 32.272 ms | sum:301351.094 ms | - 2 cyclictest:(5) | 12511.583 ms | 642578 | avg: 0.448 ms | max: 44.243 ms | sum:287632.830 ms | - 3 cyclictest:(5) | 12545.867 ms | 635953 | avg: 0.475 ms | max: 25.530 ms | sum:302374.658 ms | - - 100ms massive_intr 500us cyclictest PREEMPT_SHORT - - 1 massive_intr:(5) | 839843.919 ms | 837384 | avg: 0.264 ms | max: 74.366 ms | sum:221476.885 ms | - 2 massive_intr:(5) | 852449.913 ms | 845086 | avg: 0.252 ms | max: 68.162 ms | sum:212595.968 ms | - 3 massive_intr:(5) | 839180.725 ms | 836883 | avg: 0.266 ms | max: 69.742 ms | sum:222812.038 ms | - 1 chromium-browse:(11) | 54591.481 ms | 138388 | avg: 0.458 ms | max: 35.427 ms | sum:63401.508 ms | - 2 chromium-browse:(8) | 52034.541 ms | 132276 | avg: 0.436 ms | max: 31.826 ms | sum:57732.958 ms | - 3 chromium-browse:(8) | 55231.771 ms | 141892 | avg: 0.469 ms | max: 27.607 ms | sum:66538.697 ms | - 1 cyclictest:(5) | 13156.391 ms | 667412 | avg: 0.373 ms | max: 38.247 ms | sum:249174.502 ms | - 2 cyclictest:(5) | 12688.939 ms | 665144 | avg: 0.374 ms | max: 33.548 ms | sum:248509.392 ms | - 3 cyclictest:(5) | 13475.623 ms | 669110 | avg: 0.370 ms | max: 37.819 ms | sum:247673.390 ms 
| - -As per the numbers the, this makes cyclictest (short slice) it's -max-delay more consistent and consistency drops the sum-delay. The -trade-off is that the massive_intr (long slice) gets more context -switches and a slight increase in sum-delay. - -[mike: numbers] -Signed-off-by: Peter Zijlstra (Intel) -Tested-by: Mike Galbraith ---- - kernel/sched/fair.c | 11 ++++++++--- - kernel/sched/features.h | 4 ++++ - 2 files changed, 12 insertions(+), 3 deletions(-) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index a2cb468a9..969bea908 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -8108,9 +8108,14 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int - cfs_rq = cfs_rq_of(se); - update_curr(cfs_rq); - -- /* -- * XXX pick_eevdf(cfs_rq) != se ? -- */ -+ if (sched_feat(PREEMPT_SHORT) && pse->slice < se->slice && -+ entity_eligible(cfs_rq, pse) && -+ (s64)(pse->deadline - se->deadline) < 0 && -+ se->vlag == se->deadline) { -+ /* negate RUN_TO_PARITY */ -+ se->vlag = se->deadline - 1; -+ } -+ - if (pick_eevdf(cfs_rq) == pse) - goto preempt; - -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ef91684d8..35428a3b8 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -14,6 +14,10 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) - * 0-lag point or until is has exhausted it's slice. - */ - SCHED_FEAT(RUN_TO_PARITY, true) -+/* -+ * Allow tasks with a shorter slice to disregard RUN_TO_PARITY -+ */ -+SCHED_FEAT(PREEMPT_SHORT, true) - - /* - * Prefer to schedule the task we woke last (assuming it failed --- -2.42.0 - - -From 01679ff43f2fa3f41694107816af6aeeba5d9fb2 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 26 Sep 2023 14:39:41 +0200 -Subject: [PATCH 09/11] sched/eevdf: Revenge of the Sith^WSleeper - -For tasks that have received excess service (negative lag) allow them -to gain parity (zero lag) by sleeping. 
- - slice 30000000 (*10) - # Min Latencies: 00041 - # Avg Latencies: 00712 - # Max Latencies: 287353 - - slice 3000000 (default) - # Min Latencies: 00054 - # Avg Latencies: 00436 - # Max Latencies: 23531 - - slice 300000 (/10) - # Min Latencies: 00054 - # Avg Latencies: 00061 - # Max Latencies: 05245 - -It sucks for many other things though... so let it be an experiment. - -Signed-off-by: Peter Zijlstra (Intel) ---- - kernel/sched/fair.c | 36 ++++++++++++++++++++++++++++++++++++ - kernel/sched/features.h | 6 ++++++ - 2 files changed, 42 insertions(+) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 969bea908..b148a654d 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -4957,6 +4957,33 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - - #endif /* CONFIG_SMP */ - -+static inline u64 -+entity_vlag_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -+{ -+ u64 now, vdelta; -+ s64 delta; -+ -+ if (!(flags & ENQUEUE_WAKEUP)) -+ return se->vlag; -+ -+ if (flags & ENQUEUE_MIGRATED) -+ return 0; -+ -+ now = rq_clock_task(rq_of(cfs_rq)); -+ delta = now - se->exec_start; -+ if (delta < 0) -+ return se->vlag; -+ -+ if (sched_feat(GENTLE_SLEEPER)) -+ delta /= 2; -+ -+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load); -+ if (vdelta < -se->vlag) -+ return se->vlag + vdelta; -+ -+ return 0; -+} -+ - static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - { -@@ -4981,6 +5008,15 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - - lag = se->vlag; - -+ /* -+ * Allow tasks that have received too much service (negative -+ * lag) to (re)gain parity (zero lag) by sleeping for the -+ * equivalent duration. This ensures they will be readily -+ * eligible. 
-+ */ -+ if (sched_feat(PLACE_SLEEPER) && lag < 0) -+ lag = entity_vlag_sleeper(cfs_rq, se, flags); -+ - /* - * If we want to place a task and preserve lag, we have to - * consider the effect of the new entity on the weighted -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 35428a3b8..926511713 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -18,6 +18,12 @@ SCHED_FEAT(RUN_TO_PARITY, true) - * Allow tasks with a shorter slice to disregard RUN_TO_PARITY - */ - SCHED_FEAT(PREEMPT_SHORT, true) -+/* -+ * Let sleepers earn back lag, but not more than 0-lag. GENTLE_SLEEPERS earn at -+ * half the speed. -+ */ -+SCHED_FEAT(PLACE_SLEEPER, false) -+SCHED_FEAT(GENTLE_SLEEPER, true) - - /* - * Prefer to schedule the task we woke last (assuming it failed --- -2.42.0 - - -From cec920740e1ee07076fad5585640f7089d0aff48 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Thu, 5 Oct 2023 15:30:13 +0200 -Subject: [PATCH 10/11] sched/eevdf: Disable entity_eligible() - -Disable entity_eligible() entirely, this makes tasks much easier to -pick, but also gives rise to degenerate cases like: - -t=92 V=16 - A |----< - B |< ->C |----------------< - D |< - E |< - F |< - G |< - |---------|-----*---|---------|---------|---- - -hence, default disable. 
- -Suggested-by: Youssef Esmat -Signed-off-by: Peter Zijlstra (Intel) ---- - kernel/sched/fair.c | 3 +++ - kernel/sched/features.h | 11 +++++++++++ - 2 files changed, 14 insertions(+) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index b148a654d..00e4a32a6 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -724,6 +724,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) - */ - int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -+ if (sched_feat(EVDF)) -+ return true; -+ - struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 926511713..8c7887fd8 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -24,6 +24,17 @@ SCHED_FEAT(PREEMPT_SHORT, true) - */ - SCHED_FEAT(PLACE_SLEEPER, false) - SCHED_FEAT(GENTLE_SLEEPER, true) -+/* -+ * Disable the eligibility check -- always true. -+ * -+ * Selecting this allows short tasks, in the presence of a long task, to walk -+ * far past 0-lag and create a window where newly placed tasks will come in and -+ * starve the long task. -+ * -+ * Behaves quite terrible for mixed slice workloads as a result, very much not -+ * recommended. -+ */ -+SCHED_FEAT(EVDF, false) - - /* - * Prefer to schedule the task we woke last (assuming it failed --- -2.42.0 - - -From df2ccad31aa54f13bd882c0d9e943cd1a0adc12e Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Fri, 15 Sep 2023 00:48:45 +0200 -Subject: [PATCH 11/11] sched/eevdf: Delay dequeue - -For tasks that have negative-lag (have received 'excess' service), delay the -dequeue and keep them in the runnable tree until they're eligible again. Or -rather, keep them until they're selected again, since finding their eligibility -crossover point is expensive. 
- -The effect is a bit like sleeper bonus, the tasks keep contending for service -until either they get a wakeup or until they're selected again and are really -dequeued. - -This means that any actual dequeue happens with positive lag (service owed) -and the task is more readily run when woken next. - -Signed-off-by: Peter Zijlstra (Intel) ---- - include/linux/sched.h | 1 + - kernel/sched/core.c | 88 +++++++++++++++++++++++++++++++++-------- - kernel/sched/fair.c | 11 ++++++ - kernel/sched/features.h | 11 ++++++ - kernel/sched/sched.h | 3 +- - 5 files changed, 97 insertions(+), 17 deletions(-) - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index e90e58b2c..b6d834a47 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -894,6 +894,7 @@ struct task_struct { - unsigned sched_reset_on_fork:1; - unsigned sched_contributes_to_load:1; - unsigned sched_migrated:1; -+ unsigned sched_delayed:1; - - /* Force alignment to the next boundary: */ - unsigned :0; -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index d914ec370..cb5641bd8 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -3850,12 +3850,23 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) - - rq = __task_rq_lock(p, &rf); - if (task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ if (unlikely(p->sched_delayed)) { -+ p->sched_delayed = 0; -+ /* mustn't run a delayed task */ -+ WARN_ON_ONCE(task_on_cpu(rq, p)); -+ if (sched_feat(GENTLE_DELAY)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); -+ if (p->se.vlag > 0) -+ p->se.vlag = 0; -+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); -+ } -+ } - if (!task_on_cpu(rq, p)) { - /* - * When on_rq && !on_cpu the task is preempted, see if - * it should preempt the task that is current now.
- */ -- update_rq_clock(rq); - wakeup_preempt(rq, p, wake_flags); - } - ttwu_do_wakeup(p); -@@ -6535,6 +6546,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - # define SM_MASK_PREEMPT SM_PREEMPT - #endif - -+static void deschedule_task(struct rq *rq, struct task_struct *p, unsigned long prev_state) -+{ -+ p->sched_contributes_to_load = -+ (prev_state & TASK_UNINTERRUPTIBLE) && -+ !(prev_state & TASK_NOLOAD) && -+ !(prev_state & TASK_FROZEN); -+ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible++; -+ -+ deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); -+ -+ if (p->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+} -+ - /* - * __schedule() is the main scheduler function. - * -@@ -6619,6 +6648,8 @@ static void __sched notrace __schedule(unsigned int sched_mode) - - switch_count = &prev->nivcsw; - -+ WARN_ON_ONCE(prev->sched_delayed); -+ - /* - * We must load prev->state once (task_struct::state is volatile), such - * that we form a control dependency vs deactivate_task() below. -@@ -6628,14 +6659,6 @@ static void __sched notrace __schedule(unsigned int sched_mode) - if (signal_pending_state(prev_state, prev)) { - WRITE_ONCE(prev->__state, TASK_RUNNING); - } else { -- prev->sched_contributes_to_load = -- (prev_state & TASK_UNINTERRUPTIBLE) && -- !(prev_state & TASK_NOLOAD) && -- !(prev_state & TASK_FROZEN); -- -- if (prev->sched_contributes_to_load) -- rq->nr_uninterruptible++; -- - /* - * __schedule() ttwu() - * prev_state = prev->state; if (p->on_rq && ...) -@@ -6647,17 +6670,50 @@ static void __sched notrace __schedule(unsigned int sched_mode) - * - * After this, schedule() must not care about p->state any more. 
- */ -- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); -- -- if (prev->in_iowait) { -- atomic_inc(&rq->nr_iowait); -- delayacct_blkio_start(); -- } -+ if (sched_feat(DELAY_DEQUEUE) && -+ prev->sched_class->delay_dequeue_task && -+ prev->sched_class->delay_dequeue_task(rq, prev)) -+ prev->sched_delayed = 1; -+ else -+ deschedule_task(rq, prev, prev_state); - } - switch_count = &prev->nvcsw; - } - -- next = pick_next_task(rq, prev, &rf); -+ for (struct task_struct *tmp = prev;;) { -+ unsigned long tmp_state; -+ -+ next = pick_next_task(rq, tmp, &rf); -+ if (unlikely(tmp != prev)) -+ finish_task(tmp); -+ -+ if (likely(!next->sched_delayed)) -+ break; -+ -+ next->sched_delayed = 0; -+ -+ /* -+ * A sched_delayed task must not be runnable at this point, see -+ * ttwu_runnable(). -+ */ -+ tmp_state = READ_ONCE(next->__state); -+ if (WARN_ON_ONCE(!tmp_state)) -+ break; -+ -+ prepare_task(next); -+ /* -+ * Order ->on_cpu and ->on_rq, see the comments in -+ * try_to_wake_up(). Normally this is smp_mb__after_spinlock() -+ * above. 
-+ */ -+ smp_wmb(); -+ deschedule_task(rq, next, tmp_state); -+ if (sched_feat(GENTLE_DELAY) && next->se.vlag > 0) -+ next->se.vlag = 0; -+ -+ tmp = next; -+ } -+ - clear_tsk_need_resched(prev); - clear_preempt_need_resched(); - #ifdef CONFIG_SCHED_DEBUG -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 00e4a32a6..b25a6ad0c 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -8347,6 +8347,16 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) - return pick_next_task_fair(rq, NULL, NULL); - } - -+static bool delay_dequeue_task_fair(struct rq *rq, struct task_struct *p) -+{ -+ struct sched_entity *se = &p->se; -+ struct cfs_rq *cfs_rq = cfs_rq_of(se); -+ -+ update_curr(cfs_rq); -+ -+ return !entity_eligible(cfs_rq, se); -+} -+ - /* - * Account for a descheduled task: - */ -@@ -12942,6 +12952,7 @@ DEFINE_SCHED_CLASS(fair) = { - - .wakeup_preempt = check_preempt_wakeup_fair, - -+ .delay_dequeue_task = delay_dequeue_task_fair, - .pick_next_task = __pick_next_task_fair, - .put_prev_task = put_prev_task_fair, - .set_next_task = set_next_task_fair, -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 8c7887fd8..82e01cd97 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -35,6 +35,17 @@ SCHED_FEAT(GENTLE_SLEEPER, true) - * recommended. - */ - SCHED_FEAT(EVDF, false) -+/* -+ * Delay dequeueing tasks until they get selected or woken. -+ * -+ * By delaying the dequeue for non-eligible tasks, they remain in the -+ * competition and can burn off their negative lag. When they get selected -+ * they'll have positive lag by definition. -+ * -+ * GENTLE_DELAY clips the lag on dequeue (or wakeup) to 0. 
-+ */ -+SCHED_FEAT(DELAY_DEQUEUE, true) -+SCHED_FEAT(GENTLE_DELAY, true) - - /* - * Prefer to schedule the task we woke last (assuming it failed -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index f447bfbb1..e6ea7b63d 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2235,6 +2235,7 @@ struct sched_class { - - void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); - -+ bool (*delay_dequeue_task)(struct rq *rq, struct task_struct *p); - struct task_struct *(*pick_next_task)(struct rq *rq); - - void (*put_prev_task)(struct rq *rq, struct task_struct *p); -@@ -2288,7 +2289,7 @@ struct sched_class { - - static inline void put_prev_task(struct rq *rq, struct task_struct *prev) - { -- WARN_ON_ONCE(rq->curr != prev); -+// WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev); - } - --- -2.42.0 - diff --git a/linux-tkg-patches/6.6/0004-eevdf-Disable-DELAY_DEQUEUE.patch b/linux-tkg-patches/6.6/0004-eevdf-Disable-DELAY_DEQUEUE.patch deleted file mode 100644 index 488d0ac..0000000 --- a/linux-tkg-patches/6.6/0004-eevdf-Disable-DELAY_DEQUEUE.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 24cd48424c96118bbb0fc8af93bfdc98570c31bf Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Wed, 25 Oct 2023 21:49:14 +0200 -Subject: [PATCH] EEVDF: Disable DELAY_DEQUEUE - -Signed-off-by: Piotr Gorski ---- - kernel/sched/features.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index fd2c963b7..6225be566 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -44,7 +44,7 @@ SCHED_FEAT(EVDF, false) - * - * GENTLE_DELAY clips the lag on dequeue (or wakeup) to 0. - */ --SCHED_FEAT(DELAY_DEQUEUE, true) -+SCHED_FEAT(DELAY_DEQUEUE, false) - SCHED_FEAT(GENTLE_DELAY, true) - - /* --- -2.42.0 -