From 75dc528ce438f0de9dc4488f3de7c03a5464a6a1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:40 +0200
Subject: [PATCH 01/28] sched/fair: Add cfs_rq::avg_vruntime

In order to move to an eligibility based scheduling policy, we need
to have a better approximation of the ideal scheduler.

Specifically, for a virtual time weighted fair queueing based
scheduler the ideal scheduler will be the weighted average of the
individual virtual runtimes (math in the comment).

As such, compute the weighted average to approximate the ideal
scheduler -- note that the approximation is in the individual task
behaviour, which isn't strictly conformant.

Specifically, consider adding a task with a vruntime left of center;
in this case the average will move backwards in time -- something the
ideal scheduler would of course never do.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124603.654144274@infradead.org
---
 kernel/sched/debug.c |  32 +++++-----
 kernel/sched/fair.c  | 137 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |   5 ++
 3 files changed, 154 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
|
|
index 066ff1c8a..6d4c33402 100644
|
|
--- a/kernel/sched/debug.c
|
|
+++ b/kernel/sched/debug.c
|
|
@@ -626,10 +626,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
|
|
|
|
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
|
{
|
|
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
|
|
- spread, rq0_min_vruntime, spread0;
|
|
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
|
|
+ struct sched_entity *last, *first;
|
|
struct rq *rq = cpu_rq(cpu);
|
|
- struct sched_entity *last;
|
|
unsigned long flags;
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
@@ -643,26 +642,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
|
SPLIT_NS(cfs_rq->exec_clock));
|
|
|
|
raw_spin_rq_lock_irqsave(rq, flags);
|
|
- if (rb_first_cached(&cfs_rq->tasks_timeline))
|
|
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
|
|
+ first = __pick_first_entity(cfs_rq);
|
|
+ if (first)
|
|
+ left_vruntime = first->vruntime;
|
|
last = __pick_last_entity(cfs_rq);
|
|
if (last)
|
|
- max_vruntime = last->vruntime;
|
|
+ right_vruntime = last->vruntime;
|
|
min_vruntime = cfs_rq->min_vruntime;
|
|
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
|
|
raw_spin_rq_unlock_irqrestore(rq, flags);
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
|
|
- SPLIT_NS(MIN_vruntime));
|
|
+
|
|
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
|
|
+ SPLIT_NS(left_vruntime));
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
|
|
SPLIT_NS(min_vruntime));
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
|
|
- SPLIT_NS(max_vruntime));
|
|
- spread = max_vruntime - MIN_vruntime;
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
|
|
- SPLIT_NS(spread));
|
|
- spread0 = min_vruntime - rq0_min_vruntime;
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
|
|
- SPLIT_NS(spread0));
|
|
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
|
|
+ SPLIT_NS(avg_vruntime(cfs_rq)));
|
|
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
|
|
+ SPLIT_NS(right_vruntime));
|
|
+ spread = right_vruntime - left_vruntime;
|
|
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
|
|
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
|
|
cfs_rq->nr_spread_over);
|
|
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 1d9c2482c..30587ec12 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -601,9 +601,134 @@ static inline bool entity_before(const struct sched_entity *a,
|
|
return (s64)(a->vruntime - b->vruntime) < 0;
|
|
}
|
|
|
|
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ return (s64)(se->vruntime - cfs_rq->min_vruntime);
|
|
+}
|
|
+
|
|
#define __node_2_se(node) \
|
|
rb_entry((node), struct sched_entity, run_node)
|
|
|
|
+/*
|
|
+ * Compute virtual time from the per-task service numbers:
|
|
+ *
|
|
+ * Fair schedulers conserve lag:
|
|
+ *
|
|
+ * \Sum lag_i = 0
|
|
+ *
|
|
+ * Where lag_i is given by:
|
|
+ *
|
|
+ * lag_i = S - s_i = w_i * (V - v_i)
|
|
+ *
|
|
+ * Where S is the ideal service time and V is it's virtual time counterpart.
|
|
+ * Therefore:
|
|
+ *
|
|
+ * \Sum lag_i = 0
|
|
+ * \Sum w_i * (V - v_i) = 0
|
|
+ * \Sum w_i * V - w_i * v_i = 0
|
|
+ *
|
|
+ * From which we can solve an expression for V in v_i (which we have in
|
|
+ * se->vruntime):
|
|
+ *
|
|
+ * \Sum v_i * w_i \Sum v_i * w_i
|
|
+ * V = -------------- = --------------
|
|
+ * \Sum w_i W
|
|
+ *
|
|
+ * Specifically, this is the weighted average of all entity virtual runtimes.
|
|
+ *
|
|
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
|
|
+ * that join/leave operations happen at lag_i = 0, otherwise the
|
|
+ * virtual time has non-continguous motion equivalent to:
|
|
+ *
|
|
+ * V +-= lag_i / W
|
|
+ *
|
|
+ * Also see the comment in place_entity() that deals with this. ]]
|
|
+ *
|
|
+ * However, since v_i is u64, and the multiplcation could easily overflow
|
|
+ * transform it into a relative form that uses smaller quantities:
|
|
+ *
|
|
+ * Substitute: v_i == (v_i - v0) + v0
|
|
+ *
|
|
+ * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
|
|
+ * V = ---------------------------- = --------------------- + v0
|
|
+ * W W
|
|
+ *
|
|
+ * Which we track using:
|
|
+ *
|
|
+ * v0 := cfs_rq->min_vruntime
|
|
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
|
|
+ * \Sum w_i := cfs_rq->avg_load
|
|
+ *
|
|
+ * Since min_vruntime is a monotonic increasing variable that closely tracks
|
|
+ * the per-task service, these deltas: (v_i - v), will be in the order of the
|
|
+ * maximal (virtual) lag induced in the system due to quantisation.
|
|
+ *
|
|
+ * Also, we use scale_load_down() to reduce the size.
|
|
+ *
|
|
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
|
|
+ */
|
|
+static void
|
|
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ unsigned long weight = scale_load_down(se->load.weight);
|
|
+ s64 key = entity_key(cfs_rq, se);
|
|
+
|
|
+ cfs_rq->avg_vruntime += key * weight;
|
|
+ cfs_rq->avg_load += weight;
|
|
+}
|
|
+
|
|
+static void
|
|
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ unsigned long weight = scale_load_down(se->load.weight);
|
|
+ s64 key = entity_key(cfs_rq, se);
|
|
+
|
|
+ cfs_rq->avg_vruntime -= key * weight;
|
|
+ cfs_rq->avg_load -= weight;
|
|
+}
|
|
+
|
|
+static inline
|
|
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
|
|
+{
|
|
+ /*
|
|
+ * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
|
|
+ */
|
|
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
|
|
+}
|
|
+
|
|
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
|
|
+{
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
+ s64 avg = cfs_rq->avg_vruntime;
|
|
+ long load = cfs_rq->avg_load;
|
|
+
|
|
+ if (curr && curr->on_rq) {
|
|
+ unsigned long weight = scale_load_down(curr->load.weight);
|
|
+
|
|
+ avg += entity_key(cfs_rq, curr) * weight;
|
|
+ load += weight;
|
|
+ }
|
|
+
|
|
+ if (load)
|
|
+ avg = div_s64(avg, load);
|
|
+
|
|
+ return cfs_rq->min_vruntime + avg;
|
|
+}
|
|
+
|
|
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
|
|
+{
|
|
+ u64 min_vruntime = cfs_rq->min_vruntime;
|
|
+ /*
|
|
+ * open coded max_vruntime() to allow updating avg_vruntime
|
|
+ */
|
|
+ s64 delta = (s64)(vruntime - min_vruntime);
|
|
+ if (delta > 0) {
|
|
+ avg_vruntime_update(cfs_rq, delta);
|
|
+ min_vruntime = vruntime;
|
|
+ }
|
|
+ return min_vruntime;
|
|
+}
|
|
+
|
|
static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
|
{
|
|
struct sched_entity *curr = cfs_rq->curr;
|
|
@@ -629,7 +754,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
|
|
|
/* ensure we never gain time by being placed backwards. */
|
|
u64_u32_store(cfs_rq->min_vruntime,
|
|
- max_vruntime(cfs_rq->min_vruntime, vruntime));
|
|
+ __update_min_vruntime(cfs_rq, vruntime));
|
|
}
|
|
|
|
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
|
|
@@ -642,12 +767,14 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
|
|
*/
|
|
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
+ avg_vruntime_add(cfs_rq, se);
|
|
rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
|
|
}
|
|
|
|
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
|
|
+ avg_vruntime_sub(cfs_rq, se);
|
|
}
|
|
|
|
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
|
|
@@ -3379,6 +3506,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
/* commit outstanding execution time */
|
|
if (cfs_rq->curr == se)
|
|
update_curr(cfs_rq);
|
|
+ else
|
|
+ avg_vruntime_sub(cfs_rq, se);
|
|
update_load_sub(&cfs_rq->load, se->load.weight);
|
|
}
|
|
dequeue_load_avg(cfs_rq, se);
|
|
@@ -3394,9 +3523,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
#endif
|
|
|
|
enqueue_load_avg(cfs_rq, se);
|
|
- if (se->on_rq)
|
|
+ if (se->on_rq) {
|
|
update_load_add(&cfs_rq->load, se->load.weight);
|
|
-
|
|
+ if (cfs_rq->curr != se)
|
|
+ avg_vruntime_add(cfs_rq, se);
|
|
+ }
|
|
}
|
|
|
|
void reweight_task(struct task_struct *p, int prio)
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index e93e006a9..4ccb73d85 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -548,6 +548,9 @@ struct cfs_rq {
|
|
unsigned int idle_nr_running; /* SCHED_IDLE */
|
|
unsigned int idle_h_nr_running; /* SCHED_IDLE */
|
|
|
|
+ s64 avg_vruntime;
|
|
+ u64 avg_load;
|
|
+
|
|
u64 exec_clock;
|
|
u64 min_vruntime;
|
|
#ifdef CONFIG_SCHED_CORE
|
|
@@ -3480,4 +3483,6 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
|
|
static inline void init_sched_mm_cid(struct task_struct *t) { }
|
|
#endif
|
|
|
|
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
|
|
+
|
|
#endif /* _KERNEL_SCHED_SCHED_H */
|
|
--
|
|
2.42.0
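
For illustration only -- not part of the series -- a minimal user-space C
sketch of the relative-form average described in the comment added above:
V is tracked as \Sum (v_i - v0)*w_i and \Sum w_i relative to
v0 = min_vruntime, then reconstructed as v0 + avg_vruntime/avg_load. The
entities, weights and vruntime values below are made-up examples.

#include <stdio.h>
#include <stdint.h>

struct entity { uint64_t vruntime; unsigned long weight; };

int main(void)
{
	/* made-up runqueue: three entities with arbitrary service and weight */
	struct entity e[] = {
		{ .vruntime = 1000000, .weight = 1024 },
		{ .vruntime = 1200000, .weight =  820 },
		{ .vruntime =  900000, .weight =  335 },
	};
	uint64_t v0 = 900000;		/* stands in for cfs_rq->min_vruntime */
	int64_t avg_vruntime = 0;	/* \Sum (v_i - v0) * w_i */
	long avg_load = 0;		/* \Sum w_i */

	for (int i = 0; i < 3; i++) {
		avg_vruntime += (int64_t)(e[i].vruntime - v0) * (int64_t)e[i].weight;
		avg_load     += e[i].weight;
	}

	/* V = v0 + \Sum (v_i - v0)*w_i / \Sum w_i */
	long long V = (long long)v0 + avg_vruntime / avg_load;
	printf("V = %lld\n", V);
	return 0;
}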
|
|
|
|
|
|
From 9839c6f0a4dec304f2577c71cc53fa5adab33ff4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:41 +0200
Subject: [PATCH 02/28] sched/fair: Remove sched_feat(START_DEBIT)

With the introduction of avg_vruntime() there is no need to use worse
approximations. Take the 0-lag point as starting point for inserting
new tasks.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124603.722361178@infradead.org
---
 kernel/sched/fair.c     | 21 +--------------------
 kernel/sched/features.h |  6 ------
 2 files changed, 1 insertion(+), 26 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 30587ec12..55f80b4a3 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -906,16 +906,6 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
return slice;
|
|
}
|
|
|
|
-/*
|
|
- * We calculate the vruntime slice of a to-be-inserted task.
|
|
- *
|
|
- * vs = s/w
|
|
- */
|
|
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
-{
|
|
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
|
|
-}
|
|
-
|
|
#include "pelt.h"
|
|
#ifdef CONFIG_SMP
|
|
|
|
@@ -4862,16 +4852,7 @@ static inline bool entity_is_long_sleeper(struct sched_entity *se)
|
|
static void
|
|
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
{
|
|
- u64 vruntime = cfs_rq->min_vruntime;
|
|
-
|
|
- /*
|
|
- * The 'current' period is already promised to the current tasks,
|
|
- * however the extra weight of the new task will slow them down a
|
|
- * little, place the new task so that it fits in the slot that
|
|
- * stays open at the end.
|
|
- */
|
|
- if (initial && sched_feat(START_DEBIT))
|
|
- vruntime += sched_vslice(cfs_rq, se);
|
|
+ u64 vruntime = avg_vruntime(cfs_rq);
|
|
|
|
/* sleeps up to a single latency don't count. */
|
|
if (!initial) {
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index ee7f23c76..fa828b365 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -6,12 +6,6 @@
|
|
*/
|
|
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
|
|
|
|
-/*
|
|
- * Place new tasks ahead so that they do not starve already running
|
|
- * tasks
|
|
- */
|
|
-SCHED_FEAT(START_DEBIT, true)
|
|
-
|
|
/*
|
|
* Prefer to schedule the task we woke last (assuming it failed
|
|
* wakeup-preemption), since its likely going to consume data we
|
|
--
|
|
2.42.0
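
As a quick, illustrative sanity check (not from the patch) that the 0-lag
point is the neutral insertion point: adding an entity with v_new equal to
the weighted average V leaves V -- and therefore every other entity's
lag -- unchanged. The numbers below are arbitrary.

#include <stdio.h>

int main(void)
{
	double W = 2048.0, V = 1500.0;	/* existing total weight and average vruntime */
	double w_new = 1024.0;		/* weight of the task being placed */
	double v_new = V;		/* 0-lag placement: start at the average */

	/* V' = (W*V + w_new*v_new) / (W + w_new) */
	double V_after = (W * V + w_new * v_new) / (W + w_new);

	printf("V before = %.1f, V after = %.1f\n", V, V_after);
	return 0;
}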
|
|
|
|
|
|
From e2b83c59e712b31572aab92c651739b8577af01c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:42 +0200
Subject: [PATCH 03/28] sched/fair: Add lag based placement

With the introduction of avg_vruntime, it is possible to approximate
lag (the entire purpose of introducing it in fact). Use this to do lag
based placement over sleep+wake.

Specifically, the FAIR_SLEEPERS thing places things too far to the
left and messes up the deadline aspect of EEVDF.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124603.794929315@infradead.org
---
 include/linux/sched.h   |   3 +-
 kernel/sched/core.c     |   1 +
 kernel/sched/fair.c     | 168 +++++++++++++++++++++++++++++++---------
 kernel/sched/features.h |   8 ++
 4 files changed, 141 insertions(+), 39 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
|
|
index 609bde814..52910bfb9 100644
|
|
--- a/include/linux/sched.h
|
|
+++ b/include/linux/sched.h
|
|
@@ -554,8 +554,9 @@ struct sched_entity {
|
|
|
|
u64 exec_start;
|
|
u64 sum_exec_runtime;
|
|
- u64 vruntime;
|
|
u64 prev_sum_exec_runtime;
|
|
+ u64 vruntime;
|
|
+ s64 vlag;
|
|
|
|
u64 nr_migrations;
|
|
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index e8f73ff12..acb9d9ff3 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -4501,6 +4501,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|
p->se.prev_sum_exec_runtime = 0;
|
|
p->se.nr_migrations = 0;
|
|
p->se.vruntime = 0;
|
|
+ p->se.vlag = 0;
|
|
INIT_LIST_HEAD(&p->se.group_node);
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 55f80b4a3..faccdbb14 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -715,6 +715,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
|
|
return cfs_rq->min_vruntime + avg;
|
|
}
|
|
|
|
+/*
|
|
+ * lag_i = S - s_i = w_i * (V - v_i)
|
|
+ */
|
|
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ SCHED_WARN_ON(!se->on_rq);
|
|
+ se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
|
|
+}
|
|
+
|
|
static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
|
|
{
|
|
u64 min_vruntime = cfs_rq->min_vruntime;
|
|
@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
unsigned long weight)
|
|
{
|
|
+ unsigned long old_weight = se->load.weight;
|
|
+
|
|
if (se->on_rq) {
|
|
/* commit outstanding execution time */
|
|
if (cfs_rq->curr == se)
|
|
@@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
|
|
update_load_set(&se->load, weight);
|
|
|
|
+ if (!se->on_rq) {
|
|
+ /*
|
|
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
|
|
+ * we need to scale se->vlag when w_i changes.
|
|
+ */
|
|
+ se->vlag = div_s64(se->vlag * old_weight, weight);
|
|
+ }
|
|
+
|
|
#ifdef CONFIG_SMP
|
|
do {
|
|
u32 divider = get_pelt_divider(&se->avg);
|
|
@@ -4853,49 +4872,119 @@ static void
|
|
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
{
|
|
u64 vruntime = avg_vruntime(cfs_rq);
|
|
+ s64 lag = 0;
|
|
|
|
- /* sleeps up to a single latency don't count. */
|
|
- if (!initial) {
|
|
- unsigned long thresh;
|
|
+ /*
|
|
+ * Due to how V is constructed as the weighted average of entities,
|
|
+ * adding tasks with positive lag, or removing tasks with negative lag
|
|
+ * will move 'time' backwards, this can screw around with the lag of
|
|
+ * other tasks.
|
|
+ *
|
|
+ * EEVDF: placement strategy #1 / #2
|
|
+ */
|
|
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
+ unsigned long load;
|
|
|
|
- if (se_is_idle(se))
|
|
- thresh = sysctl_sched_min_granularity;
|
|
- else
|
|
- thresh = sysctl_sched_latency;
|
|
+ lag = se->vlag;
|
|
|
|
/*
|
|
- * Halve their sleep time's effect, to allow
|
|
- * for a gentler effect of sleepers:
|
|
+ * If we want to place a task and preserve lag, we have to
|
|
+ * consider the effect of the new entity on the weighted
|
|
+ * average and compensate for this, otherwise lag can quickly
|
|
+ * evaporate.
|
|
+ *
|
|
+ * Lag is defined as:
|
|
+ *
|
|
+ * lag_i = S - s_i = w_i * (V - v_i)
|
|
+ *
|
|
+ * To avoid the 'w_i' term all over the place, we only track
|
|
+ * the virtual lag:
|
|
+ *
|
|
+ * vl_i = V - v_i <=> v_i = V - vl_i
|
|
+ *
|
|
+ * And we take V to be the weighted average of all v:
|
|
+ *
|
|
+ * V = (\Sum w_j*v_j) / W
|
|
+ *
|
|
+ * Where W is: \Sum w_j
|
|
+ *
|
|
+ * Then, the weighted average after adding an entity with lag
|
|
+ * vl_i is given by:
|
|
+ *
|
|
+ * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
|
|
+ * = (W*V + w_i*(V - vl_i)) / (W + w_i)
|
|
+ * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
|
|
+ * = (V*(W + w_i) - w_i*l) / (W + w_i)
|
|
+ * = V - w_i*vl_i / (W + w_i)
|
|
+ *
|
|
+ * And the actual lag after adding an entity with vl_i is:
|
|
+ *
|
|
+ * vl'_i = V' - v_i
|
|
+ * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
|
|
+ * = vl_i - w_i*vl_i / (W + w_i)
|
|
+ *
|
|
+ * Which is strictly less than vl_i. So in order to preserve lag
|
|
+ * we should inflate the lag before placement such that the
|
|
+ * effective lag after placement comes out right.
|
|
+ *
|
|
+ * As such, invert the above relation for vl'_i to get the vl_i
|
|
+ * we need to use such that the lag after placement is the lag
|
|
+ * we computed before dequeue.
|
|
+ *
|
|
+ * vl'_i = vl_i - w_i*vl_i / (W + w_i)
|
|
+ * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
|
|
+ *
|
|
+ * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
|
|
+ * = W*vl_i
|
|
+ *
|
|
+ * vl_i = (W + w_i)*vl'_i / W
|
|
*/
|
|
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
|
|
- thresh >>= 1;
|
|
-
|
|
- vruntime -= thresh;
|
|
- }
|
|
-
|
|
- /*
|
|
- * Pull vruntime of the entity being placed to the base level of
|
|
- * cfs_rq, to prevent boosting it if placed backwards.
|
|
- * However, min_vruntime can advance much faster than real time, with
|
|
- * the extreme being when an entity with the minimal weight always runs
|
|
- * on the cfs_rq. If the waking entity slept for a long time, its
|
|
- * vruntime difference from min_vruntime may overflow s64 and their
|
|
- * comparison may get inversed, so ignore the entity's original
|
|
- * vruntime in that case.
|
|
- * The maximal vruntime speedup is given by the ratio of normal to
|
|
- * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
|
|
- * When placing a migrated waking entity, its exec_start has been set
|
|
- * from a different rq. In order to take into account a possible
|
|
- * divergence between new and prev rq's clocks task because of irq and
|
|
- * stolen time, we take an additional margin.
|
|
- * So, cutting off on the sleep time of
|
|
- * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
|
|
- * should be safe.
|
|
- */
|
|
- if (entity_is_long_sleeper(se))
|
|
- se->vruntime = vruntime;
|
|
- else
|
|
- se->vruntime = max_vruntime(se->vruntime, vruntime);
|
|
+ load = cfs_rq->avg_load;
|
|
+ if (curr && curr->on_rq)
|
|
+ load += curr->load.weight;
|
|
+
|
|
+ lag *= load + se->load.weight;
|
|
+ if (WARN_ON_ONCE(!load))
|
|
+ load = 1;
|
|
+ lag = div_s64(lag, load);
|
|
+
|
|
+ vruntime -= lag;
|
|
+ }
|
|
+
|
|
+ if (sched_feat(FAIR_SLEEPERS)) {
|
|
+
|
|
+ /* sleeps up to a single latency don't count. */
|
|
+ if (!initial) {
|
|
+ unsigned long thresh;
|
|
+
|
|
+ if (se_is_idle(se))
|
|
+ thresh = sysctl_sched_min_granularity;
|
|
+ else
|
|
+ thresh = sysctl_sched_latency;
|
|
+
|
|
+ /*
|
|
+ * Halve their sleep time's effect, to allow
|
|
+ * for a gentler effect of sleepers:
|
|
+ */
|
|
+ if (sched_feat(GENTLE_FAIR_SLEEPERS))
|
|
+ thresh >>= 1;
|
|
+
|
|
+ vruntime -= thresh;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Pull vruntime of the entity being placed to the base level of
|
|
+ * cfs_rq, to prevent boosting it if placed backwards. If the entity
|
|
+ * slept for a long time, don't even try to compare its vruntime with
|
|
+ * the base as it may be too far off and the comparison may get
|
|
+ * inversed due to s64 overflow.
|
|
+ */
|
|
+ if (!entity_is_long_sleeper(se))
|
|
+ vruntime = max_vruntime(se->vruntime, vruntime);
|
|
+ }
|
|
+
|
|
+ se->vruntime = vruntime;
|
|
}
|
|
|
|
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
|
|
@@ -5066,6 +5155,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
|
clear_buddies(cfs_rq, se);
|
|
|
|
+ if (flags & DEQUEUE_SLEEP)
|
|
+ update_entity_lag(cfs_rq, se);
|
|
+
|
|
if (se != cfs_rq->curr)
|
|
__dequeue_entity(cfs_rq, se);
|
|
se->on_rq = 0;
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index fa828b365..7958a10fe 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -1,11 +1,19 @@
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
+
|
|
/*
|
|
* Only give sleepers 50% of their service deficit. This allows
|
|
* them to run sooner, but does not allow tons of sleepers to
|
|
* rip the spread apart.
|
|
*/
|
|
+SCHED_FEAT(FAIR_SLEEPERS, false)
|
|
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
|
|
|
|
+/*
|
|
+ * Using the avg_vruntime, do the right thing and preserve lag across
|
|
+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
|
|
+ */
|
|
+SCHED_FEAT(PLACE_LAG, true)
|
|
+
|
|
/*
|
|
* Prefer to schedule the task we woke last (assuming it failed
|
|
* wakeup-preemption), since its likely going to consume data we
|
|
--
|
|
2.42.0
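
A small numeric check (illustrative only, with made-up values) of the
lag-preserving placement derived in the comment added above: the stored
lag vl'_i is inflated by (W + w_i)/W before being subtracted from V, and
the lag measured against the new average V' then comes out as vl'_i again.

#include <stdio.h>

int main(void)
{
	double W  = 3072.0;		/* \Sum w_j of the entities already queued */
	double V  = 2000.0;		/* their weighted average vruntime */
	double w_i = 1024.0;		/* weight of the entity being placed */
	double vl_want = 150.0;		/* lag we want it to have after placement */

	double vl_i = (W + w_i) * vl_want / W;	/* inflated lag: vl_i = (W + w_i)*vl'_i / W */
	double v_i  = V - vl_i;			/* placement: v_i = V - vl_i */

	/* weighted average after the entity joins */
	double V_after = (W * V + w_i * v_i) / (W + w_i);

	printf("requested lag %.3f, resulting lag %.3f\n", vl_want, V_after - v_i);
	return 0;
}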
|
|
|
|
|
|
From 84e74d6704600343fcf3d4e2d6e8ce4d4228d8b1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:43 +0200
Subject: [PATCH 04/28] rbtree: Add rb_add_augmented_cached() helper

Updating the augmented data while going down the tree during lookup
would be faster -- alas the augment interface does not currently allow
for that. Provide a generic, if slightly sub-optimal, helper to add a
node to an augmented cached tree.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124603.862983648@infradead.org
---
 include/linux/rbtree_augmented.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
|
|
index 7ee7ed5de..6dbc5a1bf 100644
|
|
--- a/include/linux/rbtree_augmented.h
|
|
+++ b/include/linux/rbtree_augmented.h
|
|
@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node,
|
|
rb_insert_augmented(node, &root->rb_root, augment);
|
|
}
|
|
|
|
+static __always_inline struct rb_node *
|
|
+rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
|
|
+ bool (*less)(struct rb_node *, const struct rb_node *),
|
|
+ const struct rb_augment_callbacks *augment)
|
|
+{
|
|
+ struct rb_node **link = &tree->rb_root.rb_node;
|
|
+ struct rb_node *parent = NULL;
|
|
+ bool leftmost = true;
|
|
+
|
|
+ while (*link) {
|
|
+ parent = *link;
|
|
+ if (less(node, parent)) {
|
|
+ link = &parent->rb_left;
|
|
+ } else {
|
|
+ link = &parent->rb_right;
|
|
+ leftmost = false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ rb_link_node(node, parent, link);
|
|
+ augment->propagate(parent, NULL); /* suboptimal */
|
|
+ rb_insert_augmented_cached(node, tree, leftmost, augment);
|
|
+
|
|
+ return leftmost ? node : NULL;
|
|
+}
|
|
+
|
|
/*
|
|
* Template for declaring augmented rbtree callbacks (generic case)
|
|
*
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 6439cd7527a2c4c59045cf79f60192116441288a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:44 +0200
Subject: [PATCH 05/28] sched/fair: Implement an EEVDF-like scheduling policy

CFS is currently a WFQ based scheduler with only a single knob: the
weight. The addition of a second, latency oriented parameter makes
something like WF2Q or EEVDF a much better fit.

Specifically, EEVDF does EDF like scheduling in the left half of the
tree -- those entities that are owed service. Except, because this is a
virtual time scheduler, the deadlines are in virtual time as well,
which is what allows over-subscription.

EEVDF has two parameters:

 - weight, or time-slope: which is mapped to nice just as before

 - request size, or slice length: which is used to compute
   the virtual deadline as: vd_i = ve_i + r_i/w_i

Basically, by setting a smaller slice, the deadline will be earlier
and the task will be more eligible and run earlier.

Tick driven preemption is driven by request/slice completion, while
wakeup preemption is driven by the deadline.

Because the tree is now effectively an interval tree, and the
selection is no longer 'leftmost', over-scheduling is less of a
problem.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124603.931005524@infradead.org
---
 include/linux/sched.h   |   4 +
 kernel/sched/core.c     |   1 +
 kernel/sched/debug.c    |   6 +-
 kernel/sched/fair.c     | 338 ++++++++++++++++++++++++++++++++++------
 kernel/sched/features.h |   3 +
 kernel/sched/sched.h    |   4 +-
 6 files changed, 308 insertions(+), 48 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
|
|
index 52910bfb9..35331c35f 100644
|
|
--- a/include/linux/sched.h
|
|
+++ b/include/linux/sched.h
|
|
@@ -549,6 +549,9 @@ struct sched_entity {
|
|
/* For load-balancing: */
|
|
struct load_weight load;
|
|
struct rb_node run_node;
|
|
+ u64 deadline;
|
|
+ u64 min_deadline;
|
|
+
|
|
struct list_head group_node;
|
|
unsigned int on_rq;
|
|
|
|
@@ -557,6 +560,7 @@ struct sched_entity {
|
|
u64 prev_sum_exec_runtime;
|
|
u64 vruntime;
|
|
s64 vlag;
|
|
+ u64 slice;
|
|
|
|
u64 nr_migrations;
|
|
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index acb9d9ff3..427d694ff 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -4502,6 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|
p->se.nr_migrations = 0;
|
|
p->se.vruntime = 0;
|
|
p->se.vlag = 0;
|
|
+ p->se.slice = sysctl_sched_min_granularity;
|
|
INIT_LIST_HEAD(&p->se.group_node);
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
|
|
index 6d4c33402..d4cca3b2c 100644
|
|
--- a/kernel/sched/debug.c
|
|
+++ b/kernel/sched/debug.c
|
|
@@ -581,9 +581,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
|
|
else
|
|
SEQ_printf(m, " %c", task_state_to_char(p));
|
|
|
|
- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
|
|
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
|
|
p->comm, task_pid_nr(p),
|
|
SPLIT_NS(p->se.vruntime),
|
|
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
|
|
+ SPLIT_NS(p->se.deadline),
|
|
+ SPLIT_NS(p->se.slice),
|
|
+ SPLIT_NS(p->se.sum_exec_runtime),
|
|
(long long)(p->nvcsw + p->nivcsw),
|
|
p->prio);
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index faccdbb14..3c3ff0887 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -47,6 +47,7 @@
|
|
#include <linux/psi.h>
|
|
#include <linux/ratelimit.h>
|
|
#include <linux/task_work.h>
|
|
+#include <linux/rbtree_augmented.h>
|
|
|
|
#include <asm/switch_to.h>
|
|
|
|
@@ -347,6 +348,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
|
|
return mul_u64_u32_shr(delta_exec, fact, shift);
|
|
}
|
|
|
|
+/*
|
|
+ * delta /= w
|
|
+ */
|
|
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
|
|
+{
|
|
+ if (unlikely(se->load.weight != NICE_0_LOAD))
|
|
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
|
|
+
|
|
+ return delta;
|
|
+}
|
|
|
|
const struct sched_class fair_sched_class;
|
|
|
|
@@ -717,11 +728,62 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
|
|
|
|
/*
|
|
* lag_i = S - s_i = w_i * (V - v_i)
|
|
+ *
|
|
+ * However, since V is approximated by the weighted average of all entities it
|
|
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
|
|
+ * and end up with a larger lag than we started with.
|
|
+ *
|
|
+ * Limit this to either double the slice length with a minimum of TICK_NSEC
|
|
+ * since that is the timing granularity.
|
|
+ *
|
|
+ * EEVDF gives the following limit for a steady state system:
|
|
+ *
|
|
+ * -r_max < lag < max(r_max, q)
|
|
+ *
|
|
+ * XXX could add max_slice to the augmented data to track this.
|
|
*/
|
|
void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
+ s64 lag, limit;
|
|
+
|
|
SCHED_WARN_ON(!se->on_rq);
|
|
- se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
|
|
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
|
|
+
|
|
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
|
|
+ se->vlag = clamp(lag, -limit, limit);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Entity is eligible once it received less service than it ought to have,
|
|
+ * eg. lag >= 0.
|
|
+ *
|
|
+ * lag_i = S - s_i = w_i*(V - v_i)
|
|
+ *
|
|
+ * lag_i >= 0 -> V >= v_i
|
|
+ *
|
|
+ * \Sum (v_i - v)*w_i
|
|
+ * V = ------------------ + v
|
|
+ * \Sum w_i
|
|
+ *
|
|
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
|
|
+ *
|
|
+ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
|
|
+ * to the loss in precision caused by the division.
|
|
+ */
|
|
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
+ s64 avg = cfs_rq->avg_vruntime;
|
|
+ long load = cfs_rq->avg_load;
|
|
+
|
|
+ if (curr && curr->on_rq) {
|
|
+ unsigned long weight = scale_load_down(curr->load.weight);
|
|
+
|
|
+ avg += entity_key(cfs_rq, curr) * weight;
|
|
+ load += weight;
|
|
+ }
|
|
+
|
|
+ return avg >= entity_key(cfs_rq, se) * load;
|
|
}
|
|
|
|
static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
|
|
@@ -740,8 +802,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
|
|
|
|
static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
|
{
|
|
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
|
|
struct sched_entity *curr = cfs_rq->curr;
|
|
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
|
|
|
|
u64 vruntime = cfs_rq->min_vruntime;
|
|
|
|
@@ -752,9 +814,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
|
curr = NULL;
|
|
}
|
|
|
|
- if (leftmost) { /* non-empty tree */
|
|
- struct sched_entity *se = __node_2_se(leftmost);
|
|
-
|
|
+ if (se) {
|
|
if (!curr)
|
|
vruntime = se->vruntime;
|
|
else
|
|
@@ -771,18 +831,50 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
|
|
return entity_before(__node_2_se(a), __node_2_se(b));
|
|
}
|
|
|
|
+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
|
|
+
|
|
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
|
|
+{
|
|
+ if (node) {
|
|
+ struct sched_entity *rse = __node_2_se(node);
|
|
+ if (deadline_gt(min_deadline, se, rse))
|
|
+ se->min_deadline = rse->min_deadline;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
|
|
+ */
|
|
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
|
|
+{
|
|
+ u64 old_min_deadline = se->min_deadline;
|
|
+ struct rb_node *node = &se->run_node;
|
|
+
|
|
+ se->min_deadline = se->deadline;
|
|
+ __update_min_deadline(se, node->rb_right);
|
|
+ __update_min_deadline(se, node->rb_left);
|
|
+
|
|
+ return se->min_deadline == old_min_deadline;
|
|
+}
|
|
+
|
|
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
|
|
+ run_node, min_deadline, min_deadline_update);
|
|
+
|
|
/*
|
|
* Enqueue an entity into the rb-tree:
|
|
*/
|
|
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
avg_vruntime_add(cfs_rq, se);
|
|
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
|
|
+ se->min_deadline = se->deadline;
|
|
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
|
|
+ __entity_less, &min_deadline_cb);
|
|
}
|
|
|
|
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
|
|
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
|
|
+ &min_deadline_cb);
|
|
avg_vruntime_sub(cfs_rq, se);
|
|
}
|
|
|
|
@@ -806,6 +898,97 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
|
|
return __node_2_se(next);
|
|
}
|
|
|
|
+static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
+{
|
|
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
|
|
+
|
|
+ /*
|
|
+ * If curr is set we have to see if its left of the leftmost entity
|
|
+ * still in the tree, provided there was anything in the tree at all.
|
|
+ */
|
|
+ if (!left || (curr && entity_before(curr, left)))
|
|
+ left = curr;
|
|
+
|
|
+ return left;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Earliest Eligible Virtual Deadline First
|
|
+ *
|
|
+ * In order to provide latency guarantees for different request sizes
|
|
+ * EEVDF selects the best runnable task from two criteria:
|
|
+ *
|
|
+ * 1) the task must be eligible (must be owed service)
|
|
+ *
|
|
+ * 2) from those tasks that meet 1), we select the one
|
|
+ * with the earliest virtual deadline.
|
|
+ *
|
|
+ * We can do this in O(log n) time due to an augmented RB-tree. The
|
|
+ * tree keeps the entries sorted on service, but also functions as a
|
|
+ * heap based on the deadline by keeping:
|
|
+ *
|
|
+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
|
|
+ *
|
|
+ * Which allows an EDF like search on (sub)trees.
|
|
+ */
|
|
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
|
|
+{
|
|
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
+ struct sched_entity *best = NULL;
|
|
+
|
|
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
|
|
+ curr = NULL;
|
|
+
|
|
+ while (node) {
|
|
+ struct sched_entity *se = __node_2_se(node);
|
|
+
|
|
+ /*
|
|
+ * If this entity is not eligible, try the left subtree.
|
|
+ */
|
|
+ if (!entity_eligible(cfs_rq, se)) {
|
|
+ node = node->rb_left;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If this entity has an earlier deadline than the previous
|
|
+ * best, take this one. If it also has the earliest deadline
|
|
+ * of its subtree, we're done.
|
|
+ */
|
|
+ if (!best || deadline_gt(deadline, best, se)) {
|
|
+ best = se;
|
|
+ if (best->deadline == best->min_deadline)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If the earlest deadline in this subtree is in the fully
|
|
+ * eligible left half of our space, go there.
|
|
+ */
|
|
+ if (node->rb_left &&
|
|
+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
|
|
+ node = node->rb_left;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ node = node->rb_right;
|
|
+ }
|
|
+
|
|
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
|
|
+ best = curr;
|
|
+
|
|
+ if (unlikely(!best)) {
|
|
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
|
|
+ if (left) {
|
|
+ pr_err("EEVDF scheduling fail, picking leftmost\n");
|
|
+ return left;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return best;
|
|
+}
|
|
+
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
|
|
{
|
|
@@ -839,17 +1022,6 @@ int sched_update_scaling(void)
|
|
}
|
|
#endif
|
|
|
|
-/*
|
|
- * delta /= w
|
|
- */
|
|
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
|
|
-{
|
|
- if (unlikely(se->load.weight != NICE_0_LOAD))
|
|
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
|
|
-
|
|
- return delta;
|
|
-}
|
|
-
|
|
/*
|
|
* The idea is to set a period in which each task runs once.
|
|
*
|
|
@@ -915,6 +1087,48 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
return slice;
|
|
}
|
|
|
|
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
|
|
+
|
|
+/*
|
|
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
|
|
+ * this is probably good enough.
|
|
+ */
|
|
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ if ((s64)(se->vruntime - se->deadline) < 0)
|
|
+ return;
|
|
+
|
|
+ if (sched_feat(EEVDF)) {
|
|
+ /*
|
|
+ * For EEVDF the virtual time slope is determined by w_i (iow.
|
|
+ * nice) while the request time r_i is determined by
|
|
+ * sysctl_sched_min_granularity.
|
|
+ */
|
|
+ se->slice = sysctl_sched_min_granularity;
|
|
+
|
|
+ /*
|
|
+ * The task has consumed its request, reschedule.
|
|
+ */
|
|
+ if (cfs_rq->nr_running > 1) {
|
|
+ resched_curr(rq_of(cfs_rq));
|
|
+ clear_buddies(cfs_rq, se);
|
|
+ }
|
|
+ } else {
|
|
+ /*
|
|
+ * When many tasks blow up the sched_period; it is possible
|
|
+ * that sched_slice() reports unusually large results (when
|
|
+ * many tasks are very light for example). Therefore impose a
|
|
+ * maximum.
|
|
+ */
|
|
+ se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * EEVDF: vd_i = ve_i + r_i / w_i
|
|
+ */
|
|
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
|
|
+}
|
|
+
|
|
#include "pelt.h"
|
|
#ifdef CONFIG_SMP
|
|
|
|
@@ -1047,6 +1261,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
|
|
schedstat_add(cfs_rq->exec_clock, delta_exec);
|
|
|
|
curr->vruntime += calc_delta_fair(delta_exec, curr);
|
|
+ update_deadline(cfs_rq, curr);
|
|
update_min_vruntime(cfs_rq);
|
|
|
|
if (entity_is_task(curr)) {
|
|
@@ -3521,6 +3736,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
* we need to scale se->vlag when w_i changes.
|
|
*/
|
|
se->vlag = div_s64(se->vlag * old_weight, weight);
|
|
+ } else {
|
|
+ s64 deadline = se->deadline - se->vruntime;
|
|
+ /*
|
|
+ * When the weight changes, the virtual time slope changes and
|
|
+ * we should adjust the relative virtual deadline accordingly.
|
|
+ */
|
|
+ deadline = div_s64(deadline * old_weight, weight);
|
|
+ se->deadline = se->vruntime + deadline;
|
|
}
|
|
|
|
#ifdef CONFIG_SMP
|
|
@@ -4871,6 +5094,7 @@ static inline bool entity_is_long_sleeper(struct sched_entity *se)
|
|
static void
|
|
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
{
|
|
+ u64 vslice = calc_delta_fair(se->slice, se);
|
|
u64 vruntime = avg_vruntime(cfs_rq);
|
|
s64 lag = 0;
|
|
|
|
@@ -4942,9 +5166,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
*/
|
|
load = cfs_rq->avg_load;
|
|
if (curr && curr->on_rq)
|
|
- load += curr->load.weight;
|
|
+ load += scale_load_down(curr->load.weight);
|
|
|
|
- lag *= load + se->load.weight;
|
|
+ lag *= load + scale_load_down(se->load.weight);
|
|
if (WARN_ON_ONCE(!load))
|
|
load = 1;
|
|
lag = div_s64(lag, load);
|
|
@@ -4985,6 +5209,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
}
|
|
|
|
se->vruntime = vruntime;
|
|
+
|
|
+ /*
|
|
+ * When joining the competition; the exisiting tasks will be,
|
|
+ * on average, halfway through their slice, as such start tasks
|
|
+ * off with half a slice to ease into the competition.
|
|
+ */
|
|
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
|
|
+ vslice /= 2;
|
|
+
|
|
+ /*
|
|
+ * EEVDF: vd_i = ve_i + r_i/w_i
|
|
+ */
|
|
+ se->deadline = se->vruntime + vslice;
|
|
}
|
|
|
|
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
|
|
@@ -5196,19 +5433,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
static void
|
|
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
{
|
|
- unsigned long ideal_runtime, delta_exec;
|
|
+ unsigned long delta_exec;
|
|
struct sched_entity *se;
|
|
s64 delta;
|
|
|
|
- /*
|
|
- * When many tasks blow up the sched_period; it is possible that
|
|
- * sched_slice() reports unusually large results (when many tasks are
|
|
- * very light for example). Therefore impose a maximum.
|
|
- */
|
|
- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
|
|
-
|
|
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
|
|
- if (delta_exec > ideal_runtime) {
|
|
+ if (delta_exec > curr->slice) {
|
|
resched_curr(rq_of(cfs_rq));
|
|
/*
|
|
* The current task ran long enough, ensure it doesn't get
|
|
@@ -5232,7 +5462,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
if (delta < 0)
|
|
return;
|
|
|
|
- if (delta > ideal_runtime)
|
|
+ if (delta > curr->slice)
|
|
resched_curr(rq_of(cfs_rq));
|
|
}
|
|
|
|
@@ -5287,17 +5517,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
|
|
static struct sched_entity *
|
|
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
{
|
|
- struct sched_entity *left = __pick_first_entity(cfs_rq);
|
|
- struct sched_entity *se;
|
|
+ struct sched_entity *left, *se;
|
|
|
|
- /*
|
|
- * If curr is set we have to see if its left of the leftmost entity
|
|
- * still in the tree, provided there was anything in the tree at all.
|
|
- */
|
|
- if (!left || (curr && entity_before(curr, left)))
|
|
- left = curr;
|
|
+ if (sched_feat(EEVDF)) {
|
|
+ /*
|
|
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
|
|
+ */
|
|
+ if (sched_feat(NEXT_BUDDY) &&
|
|
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
|
|
+ return cfs_rq->next;
|
|
+
|
|
+ return pick_eevdf(cfs_rq);
|
|
+ }
|
|
|
|
- se = left; /* ideally we run the leftmost entity */
|
|
+ se = left = pick_cfs(cfs_rq, curr);
|
|
|
|
/*
|
|
* Avoid running the skip buddy, if running something else can
|
|
@@ -5390,7 +5623,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
|
|
return;
|
|
#endif
|
|
|
|
- if (cfs_rq->nr_running > 1)
|
|
+ if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1)
|
|
check_preempt_tick(cfs_rq, curr);
|
|
}
|
|
|
|
@@ -6414,13 +6647,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
|
|
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
|
|
{
|
|
struct sched_entity *se = &p->se;
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
|
SCHED_WARN_ON(task_rq(p) != rq);
|
|
|
|
if (rq->cfs.h_nr_running > 1) {
|
|
- u64 slice = sched_slice(cfs_rq, se);
|
|
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
|
|
+ u64 slice = se->slice;
|
|
s64 delta = slice - ran;
|
|
|
|
if (delta < 0) {
|
|
@@ -8194,7 +8426,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
|
if (cse_is_idle != pse_is_idle)
|
|
return;
|
|
|
|
- update_curr(cfs_rq_of(se));
|
|
+ cfs_rq = cfs_rq_of(se);
|
|
+ update_curr(cfs_rq);
|
|
+
|
|
+ if (sched_feat(EEVDF)) {
|
|
+ /*
|
|
+ * XXX pick_eevdf(cfs_rq) != se ?
|
|
+ */
|
|
+ if (pick_eevdf(cfs_rq) == pse)
|
|
+ goto preempt;
|
|
+
|
|
+ return;
|
|
+ }
|
|
+
|
|
if (wakeup_preempt_entity(se, pse) == 1) {
|
|
/*
|
|
* Bias pick_next to pick the sched entity that is
|
|
@@ -8440,7 +8684,7 @@ static void yield_task_fair(struct rq *rq)
|
|
|
|
clear_buddies(cfs_rq, se);
|
|
|
|
- if (curr->policy != SCHED_BATCH) {
|
|
+ if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) {
|
|
update_rq_clock(rq);
|
|
/*
|
|
* Update run-time statistics of the 'current'.
|
|
@@ -8453,6 +8697,8 @@ static void yield_task_fair(struct rq *rq)
|
|
*/
|
|
rq_clock_skip_update(rq);
|
|
}
|
|
+ if (sched_feat(EEVDF))
|
|
+ se->deadline += calc_delta_fair(se->slice, se);
|
|
|
|
set_skip_buddy(se);
|
|
}
|
|
@@ -12208,8 +12454,8 @@ static void rq_offline_fair(struct rq *rq)
|
|
static inline bool
|
|
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
|
|
{
|
|
- u64 slice = sched_slice(cfs_rq_of(se), se);
|
|
u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
|
|
+ u64 slice = se->slice;
|
|
|
|
return (rtime * min_nr_tasks > slice);
|
|
}
|
|
@@ -12904,7 +13150,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
|
|
* idle runqueue:
|
|
*/
|
|
if (rq->cfs.load.weight)
|
|
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
|
|
+ rr_interval = NS_TO_JIFFIES(se->slice);
|
|
|
|
return rr_interval;
|
|
}
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index 7958a10fe..60cce1e6f 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -13,6 +13,7 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
|
|
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
|
|
*/
|
|
SCHED_FEAT(PLACE_LAG, true)
|
|
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
|
|
|
|
/*
|
|
* Prefer to schedule the task we woke last (assuming it failed
|
|
@@ -103,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false)
|
|
|
|
SCHED_FEAT(ALT_PERIOD, true)
|
|
SCHED_FEAT(BASE_SLICE, true)
|
|
+
|
|
+SCHED_FEAT(EEVDF, true)
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index 4ccb73d85..1fc81dd7f 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -2502,9 +2502,10 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
|
|
extern const_debug unsigned int sysctl_sched_nr_migrate;
|
|
extern const_debug unsigned int sysctl_sched_migration_cost;
|
|
|
|
+extern unsigned int sysctl_sched_min_granularity;
|
|
+
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
extern unsigned int sysctl_sched_latency;
|
|
-extern unsigned int sysctl_sched_min_granularity;
|
|
extern unsigned int sysctl_sched_idle_min_granularity;
|
|
extern unsigned int sysctl_sched_wakeup_granularity;
|
|
extern int sysctl_resched_latency_warn_ms;
|
|
@@ -3484,5 +3485,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
|
|
#endif
|
|
|
|
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
|
|
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
|
|
|
|
#endif /* _KERNEL_SCHED_SCHED_H */
|
|
--
|
|
2.42.0
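
For illustration only, a simplified O(n) user-space version of the EEVDF
pick introduced above: an entity is eligible when its vruntime is not past
the weighted average V (lag >= 0), and among the eligible entities the
earliest virtual deadline wins. The patch performs the same selection in
O(log n) using the min_deadline-augmented rbtree; the names and values
below are made up.

#include <stdio.h>

struct entity { const char *name; double vruntime, deadline, weight; };

int main(void)
{
	struct entity e[] = {
		{ "A", 1000.0, 1700.0, 1024.0 },
		{ "B",  950.0, 2200.0, 1024.0 },
		{ "C", 1300.0, 1400.0, 1024.0 },  /* earliest deadline, but not eligible */
	};
	int n = 3, best = -1;
	double V = 0.0, W = 0.0;

	/* weighted average vruntime, as in avg_vruntime() */
	for (int i = 0; i < n; i++) {
		V += e[i].vruntime * e[i].weight;
		W += e[i].weight;
	}
	V /= W;

	for (int i = 0; i < n; i++) {
		if (e[i].vruntime > V)		/* lag < 0: not eligible */
			continue;
		if (best < 0 || e[i].deadline < e[best].deadline)
			best = i;
	}

	printf("V = %.1f, picked %s\n", V, best >= 0 ? e[best].name : "none");
	return 0;
}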
|
|
|
|
|
|
From c29b5b1a88b3ed15813fc58e9a1d41e64a1d6511 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:45 +0200
Subject: [PATCH 06/28] sched/fair: Commit to lag based placement

Removes the FAIR_SLEEPERS code in favour of the new LAG based
placement.

Specifically, the whole FAIR_SLEEPER thing was a very crude
approximation to make up for the lack of lag based placement,
specifically the 'service owed' part. This is important for things
like 'starve' and 'hackbench'.

One side effect of FAIR_SLEEPER is that it caused 'small' unfairness:
by always ignoring up to 'thresh' sleeptime it would give a 50%/50%
time distribution for a 50% sleeper vs a 100% runner, while strictly
speaking this should (of course) result in a 33%/67% split (as CFS
will also do if the sleep period exceeds 'thresh').

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124604.000198861@infradead.org
---
 kernel/sched/fair.c     | 59 +----------------------------------------
 kernel/sched/features.h |  8 ------
 2 files changed, 1 insertion(+), 66 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 3c3ff0887..91f25d6c8 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -5068,29 +5068,6 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
#endif
|
|
}
|
|
|
|
-static inline bool entity_is_long_sleeper(struct sched_entity *se)
|
|
-{
|
|
- struct cfs_rq *cfs_rq;
|
|
- u64 sleep_time;
|
|
-
|
|
- if (se->exec_start == 0)
|
|
- return false;
|
|
-
|
|
- cfs_rq = cfs_rq_of(se);
|
|
-
|
|
- sleep_time = rq_clock_task(rq_of(cfs_rq));
|
|
-
|
|
- /* Happen while migrating because of clock task divergence */
|
|
- if (sleep_time <= se->exec_start)
|
|
- return false;
|
|
-
|
|
- sleep_time -= se->exec_start;
|
|
- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
|
|
- return true;
|
|
-
|
|
- return false;
|
|
-}
|
|
-
|
|
static void
|
|
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
{
|
|
@@ -5172,43 +5149,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
if (WARN_ON_ONCE(!load))
|
|
load = 1;
|
|
lag = div_s64(lag, load);
|
|
-
|
|
- vruntime -= lag;
|
|
- }
|
|
-
|
|
- if (sched_feat(FAIR_SLEEPERS)) {
|
|
-
|
|
- /* sleeps up to a single latency don't count. */
|
|
- if (!initial) {
|
|
- unsigned long thresh;
|
|
-
|
|
- if (se_is_idle(se))
|
|
- thresh = sysctl_sched_min_granularity;
|
|
- else
|
|
- thresh = sysctl_sched_latency;
|
|
-
|
|
- /*
|
|
- * Halve their sleep time's effect, to allow
|
|
- * for a gentler effect of sleepers:
|
|
- */
|
|
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
|
|
- thresh >>= 1;
|
|
-
|
|
- vruntime -= thresh;
|
|
- }
|
|
-
|
|
- /*
|
|
- * Pull vruntime of the entity being placed to the base level of
|
|
- * cfs_rq, to prevent boosting it if placed backwards. If the entity
|
|
- * slept for a long time, don't even try to compare its vruntime with
|
|
- * the base as it may be too far off and the comparison may get
|
|
- * inversed due to s64 overflow.
|
|
- */
|
|
- if (!entity_is_long_sleeper(se))
|
|
- vruntime = max_vruntime(se->vruntime, vruntime);
|
|
}
|
|
|
|
- se->vruntime = vruntime;
|
|
+ se->vruntime = vruntime - lag;
|
|
|
|
/*
|
|
* When joining the competition; the exisiting tasks will be,
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index 60cce1e6f..2a830eccd 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -1,13 +1,5 @@
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
-/*
|
|
- * Only give sleepers 50% of their service deficit. This allows
|
|
- * them to run sooner, but does not allow tons of sleepers to
|
|
- * rip the spread apart.
|
|
- */
|
|
-SCHED_FEAT(FAIR_SLEEPERS, false)
|
|
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
|
|
-
|
|
/*
|
|
* Using the avg_vruntime, do the right thing and preserve lag across
|
|
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From e891794bcc3eb5eb3e5942ec269f79355b7d0d8f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:46 +0200
Subject: [PATCH 07/28] sched/smp: Use lag to simplify cross-runqueue placement

Using lag is both more correct and simpler when moving between
runqueues.

Notably, min_vruntime() was invented as a cheap approximation of
avg_vruntime() for this very purpose (SMP migration). Since we now
have the real thing, use it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124604.068911180@infradead.org
---
 kernel/sched/fair.c | 145 ++++++--------------------------------------
 1 file changed, 19 insertions(+), 126 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 91f25d6c8..b7daccfb2 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -5083,7 +5083,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
*
|
|
* EEVDF: placement strategy #1 / #2
|
|
*/
|
|
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
|
|
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
|
|
struct sched_entity *curr = cfs_rq->curr;
|
|
unsigned long load;
|
|
|
|
@@ -5171,60 +5171,20 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
|
|
|
|
static inline bool cfs_bandwidth_used(void);
|
|
|
|
-/*
|
|
- * MIGRATION
|
|
- *
|
|
- * dequeue
|
|
- * update_curr()
|
|
- * update_min_vruntime()
|
|
- * vruntime -= min_vruntime
|
|
- *
|
|
- * enqueue
|
|
- * update_curr()
|
|
- * update_min_vruntime()
|
|
- * vruntime += min_vruntime
|
|
- *
|
|
- * this way the vruntime transition between RQs is done when both
|
|
- * min_vruntime are up-to-date.
|
|
- *
|
|
- * WAKEUP (remote)
|
|
- *
|
|
- * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
|
|
- * vruntime -= min_vruntime
|
|
- *
|
|
- * enqueue
|
|
- * update_curr()
|
|
- * update_min_vruntime()
|
|
- * vruntime += min_vruntime
|
|
- *
|
|
- * this way we don't have the most up-to-date min_vruntime on the originating
|
|
- * CPU and an up-to-date min_vruntime on the destination CPU.
|
|
- */
|
|
-
|
|
static void
|
|
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
{
|
|
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
|
|
bool curr = cfs_rq->curr == se;
|
|
|
|
/*
|
|
* If we're the current task, we must renormalise before calling
|
|
* update_curr().
|
|
*/
|
|
- if (renorm && curr)
|
|
- se->vruntime += cfs_rq->min_vruntime;
|
|
+ if (curr)
|
|
+ place_entity(cfs_rq, se, 0);
|
|
|
|
update_curr(cfs_rq);
|
|
|
|
- /*
|
|
- * Otherwise, renormalise after, such that we're placed at the current
|
|
- * moment in time, instead of some random moment in the past. Being
|
|
- * placed in the past could significantly boost this task to the
|
|
- * fairness detriment of existing tasks.
|
|
- */
|
|
- if (renorm && !curr)
|
|
- se->vruntime += cfs_rq->min_vruntime;
|
|
-
|
|
/*
|
|
* When enqueuing a sched_entity, we must:
|
|
* - Update loads to have both entity and cfs_rq synced with now.
|
|
@@ -5236,11 +5196,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
*/
|
|
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
|
|
se_update_runnable(se);
|
|
+ /*
|
|
+ * XXX update_load_avg() above will have attached us to the pelt sum;
|
|
+ * but update_cfs_group() here will re-adjust the weight and have to
|
|
+ * undo/redo all that. Seems wasteful.
|
|
+ */
|
|
update_cfs_group(se);
|
|
- account_entity_enqueue(cfs_rq, se);
|
|
|
|
- if (flags & ENQUEUE_WAKEUP)
|
|
+ /*
|
|
+ * XXX now that the entity has been re-weighted, and it's lag adjusted,
|
|
+ * we can place the entity.
|
|
+ */
|
|
+ if (!curr)
|
|
place_entity(cfs_rq, se, 0);
|
|
+
|
|
+ account_entity_enqueue(cfs_rq, se);
|
|
+
|
|
/* Entity has migrated, no longer consider this task hot */
|
|
if (flags & ENQUEUE_MIGRATED)
|
|
se->exec_start = 0;
|
|
@@ -5335,23 +5306,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
|
clear_buddies(cfs_rq, se);
|
|
|
|
- if (flags & DEQUEUE_SLEEP)
|
|
- update_entity_lag(cfs_rq, se);
|
|
-
|
|
+ update_entity_lag(cfs_rq, se);
|
|
if (se != cfs_rq->curr)
|
|
__dequeue_entity(cfs_rq, se);
|
|
se->on_rq = 0;
|
|
account_entity_dequeue(cfs_rq, se);
|
|
|
|
- /*
|
|
- * Normalize after update_curr(); which will also have moved
|
|
- * min_vruntime if @se is the one holding it back. But before doing
|
|
- * update_min_vruntime() again, which will discount @se's position and
|
|
- * can move min_vruntime forward still more.
|
|
- */
|
|
- if (!(flags & DEQUEUE_SLEEP))
|
|
- se->vruntime -= cfs_rq->min_vruntime;
|
|
-
|
|
/* return excess runtime on last dequeue */
|
|
return_cfs_rq_runtime(cfs_rq);
|
|
|
|
@@ -8174,18 +8134,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
|
|
{
|
|
struct sched_entity *se = &p->se;
|
|
|
|
- /*
|
|
- * As blocked tasks retain absolute vruntime the migration needs to
|
|
- * deal with this by subtracting the old and adding the new
|
|
- * min_vruntime -- the latter is done by enqueue_entity() when placing
|
|
- * the task on the new runqueue.
|
|
- */
|
|
- if (READ_ONCE(p->__state) == TASK_WAKING) {
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
-
|
|
- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
|
|
- }
|
|
-
|
|
if (!task_on_rq_migrating(p)) {
|
|
remove_entity_load_avg(se);
|
|
|
|
@@ -12554,8 +12502,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
|
|
*/
|
|
static void task_fork_fair(struct task_struct *p)
|
|
{
|
|
- struct cfs_rq *cfs_rq;
|
|
struct sched_entity *se = &p->se, *curr;
|
|
+ struct cfs_rq *cfs_rq;
|
|
struct rq *rq = this_rq();
|
|
struct rq_flags rf;
|
|
|
|
@@ -12564,22 +12512,9 @@ static void task_fork_fair(struct task_struct *p)
|
|
|
|
cfs_rq = task_cfs_rq(current);
|
|
curr = cfs_rq->curr;
|
|
- if (curr) {
|
|
+ if (curr)
|
|
update_curr(cfs_rq);
|
|
- se->vruntime = curr->vruntime;
|
|
- }
|
|
place_entity(cfs_rq, se, 1);
|
|
-
|
|
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
|
|
- /*
|
|
- * Upon rescheduling, sched_class::put_prev_task() will place
|
|
- * 'current' within the tree based on its new key value.
|
|
- */
|
|
- swap(curr->vruntime, se->vruntime);
|
|
- resched_curr(rq);
|
|
- }
|
|
-
|
|
- se->vruntime -= cfs_rq->min_vruntime;
|
|
rq_unlock(rq, &rf);
|
|
}
|
|
|
|
@@ -12608,34 +12543,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
|
|
check_preempt_curr(rq, p, 0);
|
|
}
|
|
|
|
-static inline bool vruntime_normalized(struct task_struct *p)
|
|
-{
|
|
- struct sched_entity *se = &p->se;
|
|
-
|
|
- /*
|
|
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
|
|
- * the dequeue_entity(.flags=0) will already have normalized the
|
|
- * vruntime.
|
|
- */
|
|
- if (p->on_rq)
|
|
- return true;
|
|
-
|
|
- /*
|
|
- * When !on_rq, vruntime of the task has usually NOT been normalized.
|
|
- * But there are some cases where it has already been normalized:
|
|
- *
|
|
- * - A forked child which is waiting for being woken up by
|
|
- * wake_up_new_task().
|
|
- * - A task which has been woken up by try_to_wake_up() and
|
|
- * waiting for actually being woken up by sched_ttwu_pending().
|
|
- */
|
|
- if (!se->sum_exec_runtime ||
|
|
- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
|
|
- return true;
|
|
-
|
|
- return false;
|
|
-}
|
|
-
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
/*
|
|
* Propagate the changes of the sched_entity across the tg tree to make it
|
|
@@ -12706,16 +12613,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
|
|
static void detach_task_cfs_rq(struct task_struct *p)
|
|
{
|
|
struct sched_entity *se = &p->se;
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
-
|
|
- if (!vruntime_normalized(p)) {
|
|
- /*
|
|
- * Fix up our vruntime so that the current sleep doesn't
|
|
- * cause 'unlimited' sleep bonus.
|
|
- */
|
|
- place_entity(cfs_rq, se, 0);
|
|
- se->vruntime -= cfs_rq->min_vruntime;
|
|
- }
|
|
|
|
detach_entity_cfs_rq(se);
|
|
}
|
|
@@ -12723,12 +12620,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
|
|
static void attach_task_cfs_rq(struct task_struct *p)
|
|
{
|
|
struct sched_entity *se = &p->se;
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
|
attach_entity_cfs_rq(se);
|
|
-
|
|
- if (!vruntime_normalized(p))
|
|
- se->vruntime += cfs_rq->min_vruntime;
|
|
}
|
|
|
|
static void switched_from_fair(struct rq *rq, struct task_struct *p)
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 991720a7eda97d92a66a5c94fd85617de0307b27 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Wed, 31 May 2023 13:58:47 +0200
|
|
Subject: [PATCH 08/28] sched/fair: Commit to EEVDF
|
|
|
|
EEVDF is a better-defined scheduling policy; as a result it has fewer
heuristics/tunables. There is no compelling reason to keep CFS around.
|
|
|
|
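For readers coming to this series cold: an entity is eligible when its
vruntime sits at or left of the weighted average V, and among eligible
entities the one with the earliest virtual deadline vd_i = ve_i + r_i/w_i
runs next. Below is a minimal, self-contained userspace sketch of that
pick rule -- illustrative only, with made-up names and a linear scan
where the kernel uses an augmented rbtree:

  #include <stdio.h>

  struct entity {
          long long vruntime;     /* ve_i */
          long long deadline;     /* vd_i = ve_i + r_i / w_i */
          long long weight;       /* w_i */
  };

  /* Weighted average of the virtual runtimes: V = sum(w_i*ve_i) / sum(w_i) */
  static long long avg_vruntime(const struct entity *e, int n)
  {
          long long sum = 0, load = 0;

          for (int i = 0; i < n; i++) {
                  sum  += e[i].weight * e[i].vruntime;
                  load += e[i].weight;
          }
          return load ? sum / load : 0;
  }

  /* EEVDF pick: eligible (ve_i <= V) entity with the earliest virtual deadline */
  static const struct entity *pick(const struct entity *e, int n)
  {
          long long V = avg_vruntime(e, n);
          const struct entity *best = NULL;

          for (int i = 0; i < n; i++) {
                  if (e[i].vruntime > V)          /* not eligible: lag < 0 */
                          continue;
                  if (!best || e[i].deadline < best->deadline)
                          best = &e[i];
          }
          return best;
  }

  int main(void)
  {
          struct entity e[] = {
                  { .vruntime = 100, .deadline = 400, .weight = 1024 },
                  { .vruntime = 300, .deadline = 350, .weight = 1024 },
                  { .vruntime = 900, .deadline = 950, .weight = 1024 },
          };
          const struct entity *se = pick(e, 3);

          printf("picked entity with deadline %lld\n", se ? se->deadline : -1LL);
          return 0;
  }
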
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
Link: https://lore.kernel.org/r/20230531124604.137187212@infradead.org
|
|
---
|
|
kernel/sched/debug.c | 6 -
|
|
kernel/sched/fair.c | 465 ++++------------------------------------
|
|
kernel/sched/features.h | 12 --
|
|
kernel/sched/sched.h | 5 -
|
|
4 files changed, 38 insertions(+), 450 deletions(-)
|
|
|
|
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
|
|
index d4cca3b2c..b21dc5aab 100644
|
|
--- a/kernel/sched/debug.c
|
|
+++ b/kernel/sched/debug.c
|
|
@@ -347,10 +347,7 @@ static __init int sched_init_debug(void)
|
|
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
|
|
#endif
|
|
|
|
- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
|
|
debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
|
|
- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
|
|
- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
|
|
|
|
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
|
|
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
|
|
@@ -865,10 +862,7 @@ static void sched_debug_header(struct seq_file *m)
|
|
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
|
|
#define PN(x) \
|
|
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
|
|
- PN(sysctl_sched_latency);
|
|
PN(sysctl_sched_min_granularity);
|
|
- PN(sysctl_sched_idle_min_granularity);
|
|
- PN(sysctl_sched_wakeup_granularity);
|
|
P(sysctl_sched_child_runs_first);
|
|
P(sysctl_sched_features);
|
|
#undef PN
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index b7daccfb2..e94cb272d 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -57,22 +57,6 @@
|
|
#include "stats.h"
|
|
#include "autogroup.h"
|
|
|
|
-/*
|
|
- * Targeted preemption latency for CPU-bound tasks:
|
|
- *
|
|
- * NOTE: this latency value is not the same as the concept of
|
|
- * 'timeslice length' - timeslices in CFS are of variable length
|
|
- * and have no persistent notion like in traditional, time-slice
|
|
- * based scheduling concepts.
|
|
- *
|
|
- * (to see the precise effective timeslice length of your workload,
|
|
- * run vmstat and monitor the context-switches (cs) field)
|
|
- *
|
|
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
|
|
- */
|
|
-unsigned int sysctl_sched_latency = 6000000ULL;
|
|
-static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
|
|
-
|
|
/*
|
|
* The initial- and re-scaling of tunables is configurable
|
|
*
|
|
@@ -94,37 +78,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
|
|
unsigned int sysctl_sched_min_granularity = 750000ULL;
|
|
static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
|
|
|
|
-/*
|
|
- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
|
|
- * Applies only when SCHED_IDLE tasks compete with normal tasks.
|
|
- *
|
|
- * (default: 0.75 msec)
|
|
- */
|
|
-unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
|
|
-
|
|
-/*
|
|
- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
|
|
- */
|
|
-static unsigned int sched_nr_latency = 8;
|
|
-
|
|
/*
|
|
* After fork, child runs first. If set to 0 (default) then
|
|
* parent will (try to) run first.
|
|
*/
|
|
unsigned int sysctl_sched_child_runs_first __read_mostly;
|
|
|
|
-/*
|
|
- * SCHED_OTHER wake-up granularity.
|
|
- *
|
|
- * This option delays the preemption effects of decoupled workloads
|
|
- * and reduces their over-scheduling. Synchronous workloads will still
|
|
- * have immediate wakeup/sleep latencies.
|
|
- *
|
|
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
|
|
- */
|
|
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
|
|
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
|
|
-
|
|
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
|
|
|
|
int sched_thermal_decay_shift;
|
|
@@ -279,8 +238,6 @@ static void update_sysctl(void)
|
|
#define SET_SYSCTL(name) \
|
|
(sysctl_##name = (factor) * normalized_sysctl_##name)
|
|
SET_SYSCTL(sched_min_granularity);
|
|
- SET_SYSCTL(sched_latency);
|
|
- SET_SYSCTL(sched_wakeup_granularity);
|
|
#undef SET_SYSCTL
|
|
}
|
|
|
|
@@ -888,30 +845,6 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
|
|
return __node_2_se(left);
|
|
}
|
|
|
|
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
|
|
-{
|
|
- struct rb_node *next = rb_next(&se->run_node);
|
|
-
|
|
- if (!next)
|
|
- return NULL;
|
|
-
|
|
- return __node_2_se(next);
|
|
-}
|
|
-
|
|
-static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
-{
|
|
- struct sched_entity *left = __pick_first_entity(cfs_rq);
|
|
-
|
|
- /*
|
|
- * If curr is set we have to see if its left of the leftmost entity
|
|
- * still in the tree, provided there was anything in the tree at all.
|
|
- */
|
|
- if (!left || (curr && entity_before(curr, left)))
|
|
- left = curr;
|
|
-
|
|
- return left;
|
|
-}
|
|
-
|
|
/*
|
|
* Earliest Eligible Virtual Deadline First
|
|
*
|
|
@@ -1008,85 +941,15 @@ int sched_update_scaling(void)
|
|
{
|
|
unsigned int factor = get_update_sysctl_factor();
|
|
|
|
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
|
|
- sysctl_sched_min_granularity);
|
|
-
|
|
#define WRT_SYSCTL(name) \
|
|
(normalized_sysctl_##name = sysctl_##name / (factor))
|
|
WRT_SYSCTL(sched_min_granularity);
|
|
- WRT_SYSCTL(sched_latency);
|
|
- WRT_SYSCTL(sched_wakeup_granularity);
|
|
#undef WRT_SYSCTL
|
|
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
-/*
|
|
- * The idea is to set a period in which each task runs once.
|
|
- *
|
|
- * When there are too many tasks (sched_nr_latency) we have to stretch
|
|
- * this period because otherwise the slices get too small.
|
|
- *
|
|
- * p = (nr <= nl) ? l : l*nr/nl
|
|
- */
|
|
-static u64 __sched_period(unsigned long nr_running)
|
|
-{
|
|
- if (unlikely(nr_running > sched_nr_latency))
|
|
- return nr_running * sysctl_sched_min_granularity;
|
|
- else
|
|
- return sysctl_sched_latency;
|
|
-}
|
|
-
|
|
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
|
|
-
|
|
-/*
|
|
- * We calculate the wall-time slice from the period by taking a part
|
|
- * proportional to the weight.
|
|
- *
|
|
- * s = p*P[w/rw]
|
|
- */
|
|
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
-{
|
|
- unsigned int nr_running = cfs_rq->nr_running;
|
|
- struct sched_entity *init_se = se;
|
|
- unsigned int min_gran;
|
|
- u64 slice;
|
|
-
|
|
- if (sched_feat(ALT_PERIOD))
|
|
- nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
|
|
-
|
|
- slice = __sched_period(nr_running + !se->on_rq);
|
|
-
|
|
- for_each_sched_entity(se) {
|
|
- struct load_weight *load;
|
|
- struct load_weight lw;
|
|
- struct cfs_rq *qcfs_rq;
|
|
-
|
|
- qcfs_rq = cfs_rq_of(se);
|
|
- load = &qcfs_rq->load;
|
|
-
|
|
- if (unlikely(!se->on_rq)) {
|
|
- lw = qcfs_rq->load;
|
|
-
|
|
- update_load_add(&lw, se->load.weight);
|
|
- load = &lw;
|
|
- }
|
|
- slice = __calc_delta(slice, se->load.weight, load);
|
|
- }
|
|
-
|
|
- if (sched_feat(BASE_SLICE)) {
|
|
- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
|
|
- min_gran = sysctl_sched_idle_min_granularity;
|
|
- else
|
|
- min_gran = sysctl_sched_min_granularity;
|
|
-
|
|
- slice = max_t(u64, slice, min_gran);
|
|
- }
|
|
-
|
|
- return slice;
|
|
-}
|
|
-
|
|
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
|
|
|
|
/*
|
|
@@ -1098,35 +961,25 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
if ((s64)(se->vruntime - se->deadline) < 0)
|
|
return;
|
|
|
|
- if (sched_feat(EEVDF)) {
|
|
- /*
|
|
- * For EEVDF the virtual time slope is determined by w_i (iow.
|
|
- * nice) while the request time r_i is determined by
|
|
- * sysctl_sched_min_granularity.
|
|
- */
|
|
- se->slice = sysctl_sched_min_granularity;
|
|
-
|
|
- /*
|
|
- * The task has consumed its request, reschedule.
|
|
- */
|
|
- if (cfs_rq->nr_running > 1) {
|
|
- resched_curr(rq_of(cfs_rq));
|
|
- clear_buddies(cfs_rq, se);
|
|
- }
|
|
- } else {
|
|
- /*
|
|
- * When many tasks blow up the sched_period; it is possible
|
|
- * that sched_slice() reports unusually large results (when
|
|
- * many tasks are very light for example). Therefore impose a
|
|
- * maximum.
|
|
- */
|
|
- se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
|
|
- }
|
|
+ /*
|
|
+ * For EEVDF the virtual time slope is determined by w_i (iow.
|
|
+ * nice) while the request time r_i is determined by
|
|
+ * sysctl_sched_min_granularity.
|
|
+ */
|
|
+ se->slice = sysctl_sched_min_granularity;
|
|
|
|
/*
|
|
* EEVDF: vd_i = ve_i + r_i / w_i
|
|
*/
|
|
se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
|
|
+
|
|
+ /*
|
|
+ * The task has consumed its request, reschedule.
|
|
+ */
|
|
+ if (cfs_rq->nr_running > 1) {
|
|
+ resched_curr(rq_of(cfs_rq));
|
|
+ clear_buddies(cfs_rq, se);
|
|
+ }
|
|
}
|
|
|
|
#include "pelt.h"
|
|
@@ -5055,19 +4908,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
-{
|
|
-#ifdef CONFIG_SCHED_DEBUG
|
|
- s64 d = se->vruntime - cfs_rq->min_vruntime;
|
|
-
|
|
- if (d < 0)
|
|
- d = -d;
|
|
-
|
|
- if (d > 3*sysctl_sched_latency)
|
|
- schedstat_inc(cfs_rq->nr_spread_over);
|
|
-#endif
|
|
-}
|
|
-
|
|
static void
|
|
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
{
|
|
@@ -5218,7 +5058,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
|
check_schedstat_required();
|
|
update_stats_enqueue_fair(cfs_rq, se, flags);
|
|
- check_spread(cfs_rq, se);
|
|
if (!curr)
|
|
__enqueue_entity(cfs_rq, se);
|
|
se->on_rq = 1;
|
|
@@ -5230,17 +5069,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
}
|
|
}
|
|
|
|
-static void __clear_buddies_last(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se) {
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- if (cfs_rq->last != se)
|
|
- break;
|
|
-
|
|
- cfs_rq->last = NULL;
|
|
- }
|
|
-}
|
|
-
|
|
static void __clear_buddies_next(struct sched_entity *se)
|
|
{
|
|
for_each_sched_entity(se) {
|
|
@@ -5252,27 +5080,10 @@ static void __clear_buddies_next(struct sched_entity *se)
|
|
}
|
|
}
|
|
|
|
-static void __clear_buddies_skip(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se) {
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- if (cfs_rq->skip != se)
|
|
- break;
|
|
-
|
|
- cfs_rq->skip = NULL;
|
|
- }
|
|
-}
|
|
-
|
|
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- if (cfs_rq->last == se)
|
|
- __clear_buddies_last(se);
|
|
-
|
|
if (cfs_rq->next == se)
|
|
__clear_buddies_next(se);
|
|
-
|
|
- if (cfs_rq->skip == se)
|
|
- __clear_buddies_skip(se);
|
|
}
|
|
|
|
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
|
|
@@ -5330,45 +5141,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
update_idle_cfs_rq_clock_pelt(cfs_rq);
|
|
}
|
|
|
|
-/*
|
|
- * Preempt the current task with a newly woken task if needed:
|
|
- */
|
|
-static void
|
|
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
-{
|
|
- unsigned long delta_exec;
|
|
- struct sched_entity *se;
|
|
- s64 delta;
|
|
-
|
|
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
|
|
- if (delta_exec > curr->slice) {
|
|
- resched_curr(rq_of(cfs_rq));
|
|
- /*
|
|
- * The current task ran long enough, ensure it doesn't get
|
|
- * re-elected due to buddy favours.
|
|
- */
|
|
- clear_buddies(cfs_rq, curr);
|
|
- return;
|
|
- }
|
|
-
|
|
- /*
|
|
- * Ensure that a task that missed wakeup preemption by a
|
|
- * narrow margin doesn't have to wait for a full slice.
|
|
- * This also mitigates buddy induced latencies under load.
|
|
- */
|
|
- if (delta_exec < sysctl_sched_min_granularity)
|
|
- return;
|
|
-
|
|
- se = __pick_first_entity(cfs_rq);
|
|
- delta = curr->vruntime - se->vruntime;
|
|
-
|
|
- if (delta < 0)
|
|
- return;
|
|
-
|
|
- if (delta > curr->slice)
|
|
- resched_curr(rq_of(cfs_rq));
|
|
-}
|
|
-
|
|
static void
|
|
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
@@ -5407,9 +5179,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
se->prev_sum_exec_runtime = se->sum_exec_runtime;
|
|
}
|
|
|
|
-static int
|
|
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
|
|
-
|
|
/*
|
|
* Pick the next process, keeping these things in mind, in this order:
|
|
* 1) keep things fair between processes/task groups
|
|
@@ -5420,53 +5189,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
|
|
static struct sched_entity *
|
|
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
{
|
|
- struct sched_entity *left, *se;
|
|
-
|
|
- if (sched_feat(EEVDF)) {
|
|
- /*
|
|
- * Enabling NEXT_BUDDY will affect latency but not fairness.
|
|
- */
|
|
- if (sched_feat(NEXT_BUDDY) &&
|
|
- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
|
|
- return cfs_rq->next;
|
|
-
|
|
- return pick_eevdf(cfs_rq);
|
|
- }
|
|
-
|
|
- se = left = pick_cfs(cfs_rq, curr);
|
|
-
|
|
/*
|
|
- * Avoid running the skip buddy, if running something else can
|
|
- * be done without getting too unfair.
|
|
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
|
|
*/
|
|
- if (cfs_rq->skip && cfs_rq->skip == se) {
|
|
- struct sched_entity *second;
|
|
+ if (sched_feat(NEXT_BUDDY) &&
|
|
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
|
|
+ return cfs_rq->next;
|
|
|
|
- if (se == curr) {
|
|
- second = __pick_first_entity(cfs_rq);
|
|
- } else {
|
|
- second = __pick_next_entity(se);
|
|
- if (!second || (curr && entity_before(curr, second)))
|
|
- second = curr;
|
|
- }
|
|
-
|
|
- if (second && wakeup_preempt_entity(second, left) < 1)
|
|
- se = second;
|
|
- }
|
|
-
|
|
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
|
|
- /*
|
|
- * Someone really wants this to run. If it's not unfair, run it.
|
|
- */
|
|
- se = cfs_rq->next;
|
|
- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
|
|
- /*
|
|
- * Prefer last buddy, try to return the CPU to a preempted task.
|
|
- */
|
|
- se = cfs_rq->last;
|
|
- }
|
|
-
|
|
- return se;
|
|
+ return pick_eevdf(cfs_rq);
|
|
}
|
|
|
|
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
|
|
@@ -5483,8 +5213,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
|
|
/* throttle cfs_rqs exceeding runtime */
|
|
check_cfs_rq_runtime(cfs_rq);
|
|
|
|
- check_spread(cfs_rq, prev);
|
|
-
|
|
if (prev->on_rq) {
|
|
update_stats_wait_start_fair(cfs_rq, prev);
|
|
/* Put 'current' back into the tree. */
|
|
@@ -5525,9 +5253,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
|
|
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
|
|
return;
|
|
#endif
|
|
-
|
|
- if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1)
|
|
- check_preempt_tick(cfs_rq, curr);
|
|
}
|
|
|
|
|
|
@@ -6579,8 +6304,7 @@ static void hrtick_update(struct rq *rq)
|
|
if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
|
|
return;
|
|
|
|
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
|
|
- hrtick_start_fair(rq, curr);
|
|
+ hrtick_start_fair(rq, curr);
|
|
}
|
|
#else /* !CONFIG_SCHED_HRTICK */
|
|
static inline void
|
|
@@ -6621,17 +6345,6 @@ static int sched_idle_rq(struct rq *rq)
|
|
rq->nr_running);
|
|
}
|
|
|
|
-/*
|
|
- * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
|
|
- * of idle_nr_running, which does not consider idle descendants of normal
|
|
- * entities.
|
|
- */
|
|
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
|
|
-{
|
|
- return cfs_rq->nr_running &&
|
|
- cfs_rq->nr_running == cfs_rq->idle_nr_running;
|
|
-}
|
|
-
|
|
#ifdef CONFIG_SMP
|
|
static int sched_idle_cpu(int cpu)
|
|
{
|
|
@@ -8171,66 +7884,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|
}
|
|
#endif /* CONFIG_SMP */
|
|
|
|
-static unsigned long wakeup_gran(struct sched_entity *se)
|
|
-{
|
|
- unsigned long gran = sysctl_sched_wakeup_granularity;
|
|
-
|
|
- /*
|
|
- * Since its curr running now, convert the gran from real-time
|
|
- * to virtual-time in his units.
|
|
- *
|
|
- * By using 'se' instead of 'curr' we penalize light tasks, so
|
|
- * they get preempted easier. That is, if 'se' < 'curr' then
|
|
- * the resulting gran will be larger, therefore penalizing the
|
|
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
|
|
- * be smaller, again penalizing the lighter task.
|
|
- *
|
|
- * This is especially important for buddies when the leftmost
|
|
- * task is higher priority than the buddy.
|
|
- */
|
|
- return calc_delta_fair(gran, se);
|
|
-}
|
|
-
|
|
-/*
|
|
- * Should 'se' preempt 'curr'.
|
|
- *
|
|
- * |s1
|
|
- * |s2
|
|
- * |s3
|
|
- * g
|
|
- * |<--->|c
|
|
- *
|
|
- * w(c, s1) = -1
|
|
- * w(c, s2) = 0
|
|
- * w(c, s3) = 1
|
|
- *
|
|
- */
|
|
-static int
|
|
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
|
|
-{
|
|
- s64 gran, vdiff = curr->vruntime - se->vruntime;
|
|
-
|
|
- if (vdiff <= 0)
|
|
- return -1;
|
|
-
|
|
- gran = wakeup_gran(se);
|
|
- if (vdiff > gran)
|
|
- return 1;
|
|
-
|
|
- return 0;
|
|
-}
|
|
-
|
|
-static void set_last_buddy(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se) {
|
|
- if (SCHED_WARN_ON(!se->on_rq))
|
|
- return;
|
|
- if (se_is_idle(se))
|
|
- return;
|
|
- cfs_rq_of(se)->last = se;
|
|
- }
|
|
-}
|
|
-
|
|
static void set_next_buddy(struct sched_entity *se)
|
|
{
|
|
for_each_sched_entity(se) {
|
|
@@ -8242,12 +7895,6 @@ static void set_next_buddy(struct sched_entity *se)
|
|
}
|
|
}
|
|
|
|
-static void set_skip_buddy(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se)
|
|
- cfs_rq_of(se)->skip = se;
|
|
-}
|
|
-
|
|
/*
|
|
* Preempt the current task with a newly woken task if needed:
|
|
*/
|
|
@@ -8256,7 +7903,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
|
struct task_struct *curr = rq->curr;
|
|
struct sched_entity *se = &curr->se, *pse = &p->se;
|
|
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
|
- int scale = cfs_rq->nr_running >= sched_nr_latency;
|
|
int next_buddy_marked = 0;
|
|
int cse_is_idle, pse_is_idle;
|
|
|
|
@@ -8272,7 +7918,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
|
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
|
|
return;
|
|
|
|
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
|
|
+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
|
|
set_next_buddy(pse);
|
|
next_buddy_marked = 1;
|
|
}
|
|
@@ -8320,44 +7966,16 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
|
cfs_rq = cfs_rq_of(se);
|
|
update_curr(cfs_rq);
|
|
|
|
- if (sched_feat(EEVDF)) {
|
|
- /*
|
|
- * XXX pick_eevdf(cfs_rq) != se ?
|
|
- */
|
|
- if (pick_eevdf(cfs_rq) == pse)
|
|
- goto preempt;
|
|
-
|
|
- return;
|
|
- }
|
|
-
|
|
- if (wakeup_preempt_entity(se, pse) == 1) {
|
|
- /*
|
|
- * Bias pick_next to pick the sched entity that is
|
|
- * triggering this preemption.
|
|
- */
|
|
- if (!next_buddy_marked)
|
|
- set_next_buddy(pse);
|
|
+ /*
|
|
+ * XXX pick_eevdf(cfs_rq) != se ?
|
|
+ */
|
|
+ if (pick_eevdf(cfs_rq) == pse)
|
|
goto preempt;
|
|
- }
|
|
|
|
return;
|
|
|
|
preempt:
|
|
resched_curr(rq);
|
|
- /*
|
|
- * Only set the backward buddy when the current task is still
|
|
- * on the rq. This can happen when a wakeup gets interleaved
|
|
- * with schedule on the ->pre_schedule() or idle_balance()
|
|
- * point, either of which can * drop the rq lock.
|
|
- *
|
|
- * Also, during early boot the idle thread is in the fair class,
|
|
- * for obvious reasons its a bad idea to schedule back to it.
|
|
- */
|
|
- if (unlikely(!se->on_rq || curr == rq->idle))
|
|
- return;
|
|
-
|
|
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
|
|
- set_last_buddy(se);
|
|
}
|
|
|
|
#ifdef CONFIG_SMP
|
|
@@ -8558,8 +8176,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
|
|
|
|
/*
|
|
* sched_yield() is very simple
|
|
- *
|
|
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
|
|
*/
|
|
static void yield_task_fair(struct rq *rq)
|
|
{
|
|
@@ -8575,23 +8191,19 @@ static void yield_task_fair(struct rq *rq)
|
|
|
|
clear_buddies(cfs_rq, se);
|
|
|
|
- if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) {
|
|
- update_rq_clock(rq);
|
|
- /*
|
|
- * Update run-time statistics of the 'current'.
|
|
- */
|
|
- update_curr(cfs_rq);
|
|
- /*
|
|
- * Tell update_rq_clock() that we've just updated,
|
|
- * so we don't do microscopic update in schedule()
|
|
- * and double the fastpath cost.
|
|
- */
|
|
- rq_clock_skip_update(rq);
|
|
- }
|
|
- if (sched_feat(EEVDF))
|
|
- se->deadline += calc_delta_fair(se->slice, se);
|
|
+ update_rq_clock(rq);
|
|
+ /*
|
|
+ * Update run-time statistics of the 'current'.
|
|
+ */
|
|
+ update_curr(cfs_rq);
|
|
+ /*
|
|
+ * Tell update_rq_clock() that we've just updated,
|
|
+ * so we don't do microscopic update in schedule()
|
|
+ * and double the fastpath cost.
|
|
+ */
|
|
+ rq_clock_skip_update(rq);
|
|
|
|
- set_skip_buddy(se);
|
|
+ se->deadline += calc_delta_fair(se->slice, se);
|
|
}
|
|
|
|
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
|
|
@@ -8834,8 +8446,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
|
|
* Buddy candidates are cache hot:
|
|
*/
|
|
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
|
|
- (&p->se == cfs_rq_of(&p->se)->next ||
|
|
- &p->se == cfs_rq_of(&p->se)->last))
|
|
+ (&p->se == cfs_rq_of(&p->se)->next))
|
|
return 1;
|
|
|
|
if (sysctl_sched_migration_cost == -1)
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index 2a830eccd..54334ca5c 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -14,13 +14,6 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
|
|
*/
|
|
SCHED_FEAT(NEXT_BUDDY, false)
|
|
|
|
-/*
|
|
- * Prefer to schedule the task that ran last (when we did
|
|
- * wake-preempt) as that likely will touch the same data, increases
|
|
- * cache locality.
|
|
- */
|
|
-SCHED_FEAT(LAST_BUDDY, true)
|
|
-
|
|
/*
|
|
* Consider buddies to be cache hot, decreases the likeliness of a
|
|
* cache buddy being migrated away, increases cache locality.
|
|
@@ -93,8 +86,3 @@ SCHED_FEAT(UTIL_EST, true)
|
|
SCHED_FEAT(UTIL_EST_FASTUP, true)
|
|
|
|
SCHED_FEAT(LATENCY_WARN, false)
|
|
-
|
|
-SCHED_FEAT(ALT_PERIOD, true)
|
|
-SCHED_FEAT(BASE_SLICE, true)
|
|
-
|
|
-SCHED_FEAT(EEVDF, true)
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index 1fc81dd7f..83bbcd35c 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -570,8 +570,6 @@ struct cfs_rq {
|
|
*/
|
|
struct sched_entity *curr;
|
|
struct sched_entity *next;
|
|
- struct sched_entity *last;
|
|
- struct sched_entity *skip;
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
unsigned int nr_spread_over;
|
|
@@ -2505,9 +2503,6 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
|
|
extern unsigned int sysctl_sched_min_granularity;
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
-extern unsigned int sysctl_sched_latency;
|
|
-extern unsigned int sysctl_sched_idle_min_granularity;
|
|
-extern unsigned int sysctl_sched_wakeup_granularity;
|
|
extern int sysctl_resched_latency_warn_ms;
|
|
extern int sysctl_resched_latency_warn_once;
|
|
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 80d62dca8d49a1a1de964786d19b350b7e910365 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Wed, 31 May 2023 13:58:48 +0200
|
|
Subject: [PATCH 09/28] sched/debug: Rename sysctl_sched_min_granularity to
|
|
sysctl_sched_base_slice
|
|
|
|
EEVDF uses this tunable as the base request/slice -- make sure the
|
|
name reflects this.
|
|
|
|
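The 'base request/slice' role can be seen in the deadline update this
series introduced: the tunable is the wall-clock request r_i, and the
virtual-time extent added to the deadline is r_i/w_i, which shrinks as
the weight grows. A standalone sketch of that conversion, ignoring the
kernel's fixed-point __calc_delta() machinery (the example weights are
made up):

  #include <stdio.h>

  #define NICE_0_WEIGHT 1024

  /* calc_delta_fair()-style conversion: wall-clock request -> virtual time */
  static unsigned long long wall_to_virtual(unsigned long long delta,
                                            unsigned long weight)
  {
          return delta * NICE_0_WEIGHT / weight;
  }

  int main(void)
  {
          const unsigned long long base_slice = 750000;   /* ns: the tunable */
          const unsigned long weights[] = { 512, 1024, 2048 };    /* made up */

          for (int i = 0; i < 3; i++)
                  printf("w_i = %4lu: r_i = %llu ns, r_i/w_i = %llu\n",
                         weights[i], base_slice,
                         wall_to_virtual(base_slice, weights[i]));
          return 0;
  }
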
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
Link: https://lore.kernel.org/r/20230531124604.205287511@infradead.org
|
|
---
|
|
kernel/sched/core.c | 2 +-
|
|
kernel/sched/debug.c | 4 ++--
|
|
kernel/sched/fair.c | 12 ++++++------
|
|
kernel/sched/sched.h | 2 +-
|
|
4 files changed, 10 insertions(+), 10 deletions(-)
|
|
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index 427d694ff..be77d999d 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -4502,7 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|
p->se.nr_migrations = 0;
|
|
p->se.vruntime = 0;
|
|
p->se.vlag = 0;
|
|
- p->se.slice = sysctl_sched_min_granularity;
|
|
+ p->se.slice = sysctl_sched_base_slice;
|
|
INIT_LIST_HEAD(&p->se.group_node);
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
|
|
index b21dc5aab..2c5bb64f5 100644
|
|
--- a/kernel/sched/debug.c
|
|
+++ b/kernel/sched/debug.c
|
|
@@ -347,7 +347,7 @@ static __init int sched_init_debug(void)
|
|
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
|
|
#endif
|
|
|
|
- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
|
|
+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
|
|
|
|
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
|
|
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
|
|
@@ -862,7 +862,7 @@ static void sched_debug_header(struct seq_file *m)
|
|
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
|
|
#define PN(x) \
|
|
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
|
|
- PN(sysctl_sched_min_granularity);
|
|
+ PN(sysctl_sched_base_slice);
|
|
P(sysctl_sched_child_runs_first);
|
|
P(sysctl_sched_features);
|
|
#undef PN
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index e94cb272d..c4244989e 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -75,8 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
|
|
*
|
|
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
|
|
*/
|
|
-unsigned int sysctl_sched_min_granularity = 750000ULL;
|
|
-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
|
|
+unsigned int sysctl_sched_base_slice = 750000ULL;
|
|
+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
|
|
|
|
/*
|
|
* After fork, child runs first. If set to 0 (default) then
|
|
@@ -237,7 +237,7 @@ static void update_sysctl(void)
|
|
|
|
#define SET_SYSCTL(name) \
|
|
(sysctl_##name = (factor) * normalized_sysctl_##name)
|
|
- SET_SYSCTL(sched_min_granularity);
|
|
+ SET_SYSCTL(sched_base_slice);
|
|
#undef SET_SYSCTL
|
|
}
|
|
|
|
@@ -943,7 +943,7 @@ int sched_update_scaling(void)
|
|
|
|
#define WRT_SYSCTL(name) \
|
|
(normalized_sysctl_##name = sysctl_##name / (factor))
|
|
- WRT_SYSCTL(sched_min_granularity);
|
|
+ WRT_SYSCTL(sched_base_slice);
|
|
#undef WRT_SYSCTL
|
|
|
|
return 0;
|
|
@@ -964,9 +964,9 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
/*
|
|
* For EEVDF the virtual time slope is determined by w_i (iow.
|
|
* nice) while the request time r_i is determined by
|
|
- * sysctl_sched_min_granularity.
|
|
+ * sysctl_sched_base_slice.
|
|
*/
|
|
- se->slice = sysctl_sched_min_granularity;
|
|
+ se->slice = sysctl_sched_base_slice;
|
|
|
|
/*
|
|
* EEVDF: vd_i = ve_i + r_i / w_i
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index 83bbcd35c..e21f6a048 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -2500,7 +2500,7 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
|
|
extern const_debug unsigned int sysctl_sched_nr_migrate;
|
|
extern const_debug unsigned int sysctl_sched_migration_cost;
|
|
|
|
-extern unsigned int sysctl_sched_min_granularity;
|
|
+extern unsigned int sysctl_sched_base_slice;
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
extern int sysctl_resched_latency_warn_ms;
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 1d3a784709658acd993cab0118c3a251321aaea3 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Wed, 31 May 2023 13:58:49 +0200
|
|
Subject: [PATCH 10/28] sched/fair: Propagate enqueue flags into place_entity()
|
|
|
|
This allows place_entity() to consider ENQUEUE_WAKEUP and
|
|
ENQUEUE_MIGRATED.
|
|
|
|
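As a rough illustration of what the flags buy over the old 'initial'
boolean, here is a hypothetical placement hook branching on the enqueue
reason. Only ENQUEUE_INITIAL (0x80) comes from this patch; the other
values and the messages are purely illustrative:

  #include <stdio.h>

  /* Illustrative flag values; ENQUEUE_INITIAL is the one added below. */
  #define ENQUEUE_WAKEUP          0x01
  #define ENQUEUE_MIGRATED        0x40
  #define ENQUEUE_INITIAL         0x80

  /* Hypothetical placement hook: branch on why the entity is enqueued. */
  static void place(int flags)
  {
          if (flags & ENQUEUE_INITIAL)
                  printf("fork: start with a shortened initial slice\n");
          else if (flags & ENQUEUE_MIGRATED)
                  printf("migration: re-place against the new runqueue's V\n");
          else if (flags & ENQUEUE_WAKEUP)
                  printf("wakeup: compensate for the entity's stored lag\n");
          else
                  printf("plain enqueue\n");
  }

  int main(void)
  {
          place(ENQUEUE_INITIAL);
          place(ENQUEUE_WAKEUP | ENQUEUE_MIGRATED);
          place(0);
          return 0;
  }
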
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
Link: https://lore.kernel.org/r/20230531124604.274010996@infradead.org
|
|
---
|
|
kernel/sched/fair.c | 10 +++++-----
|
|
kernel/sched/sched.h | 1 +
|
|
2 files changed, 6 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index c4244989e..7dd9abc63 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -4909,7 +4909,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
|
|
#endif /* CONFIG_SMP */
|
|
|
|
static void
|
|
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
{
|
|
u64 vslice = calc_delta_fair(se->slice, se);
|
|
u64 vruntime = avg_vruntime(cfs_rq);
|
|
@@ -4998,7 +4998,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
* on average, halfway through their slice, as such start tasks
|
|
* off with half a slice to ease into the competition.
|
|
*/
|
|
- if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
|
|
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
|
|
vslice /= 2;
|
|
|
|
/*
|
|
@@ -5021,7 +5021,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
* update_curr().
|
|
*/
|
|
if (curr)
|
|
- place_entity(cfs_rq, se, 0);
|
|
+ place_entity(cfs_rq, se, flags);
|
|
|
|
update_curr(cfs_rq);
|
|
|
|
@@ -5048,7 +5048,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
* we can place the entity.
|
|
*/
|
|
if (!curr)
|
|
- place_entity(cfs_rq, se, 0);
|
|
+ place_entity(cfs_rq, se, flags);
|
|
|
|
account_entity_enqueue(cfs_rq, se);
|
|
|
|
@@ -12125,7 +12125,7 @@ static void task_fork_fair(struct task_struct *p)
|
|
curr = cfs_rq->curr;
|
|
if (curr)
|
|
update_curr(cfs_rq);
|
|
- place_entity(cfs_rq, se, 1);
|
|
+ place_entity(cfs_rq, se, ENQUEUE_INITIAL);
|
|
rq_unlock(rq, &rf);
|
|
}
|
|
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index e21f6a048..576d371c8 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -2196,6 +2196,7 @@ extern const u32 sched_prio_to_wmult[40];
|
|
#else
|
|
#define ENQUEUE_MIGRATED 0x00
|
|
#endif
|
|
+#define ENQUEUE_INITIAL 0x80
|
|
|
|
#define RETRY_TASK ((void *)-1UL)
|
|
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 5b1ff22164a1098fd0a71a8bbc2e14387df3950b Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Wed, 16 Aug 2023 15:40:59 +0200
|
|
Subject: [PATCH 11/28] sched/eevdf: Curb wakeup-preemption
|
|
|
|
Mike and others noticed that EEVDF does like to over-schedule quite a
|
|
bit -- which does hurt performance of a number of benchmarks /
|
|
workloads.
|
|
|
|
In particular, what seems to cause over-scheduling is that when lag is
of the same order (or larger) than the request / slice, placement will
not only put the task left of current, but also give it a smaller
deadline than current, which causes immediate preemption.
|
|
|
|
[ notably, lag bounds are relative to HZ ]
|
|
|
|
Mike suggested we stick to picking 'current' for as long as it's
|
|
eligible to run, giving it uninterrupted runtime until it reaches
|
|
parity with the pack.
|
|
|
|
Augment Mike's suggestion by only allowing it to exhaust its initial
request.
|
|
|
|
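To make the mechanism concrete, a standalone arithmetic sketch of the
placement described above, with all numbers invented: once the owed lag
is of the same order as the request, the woken entity lands left of V
with a virtual deadline earlier than current's, so the next pick flips
to it immediately. RUN_TO_PARITY below keeps returning current from the
pick until it stops being eligible or starts a new request:

  #include <stdio.h>

  int main(void)
  {
          /* All values invented, in virtual-time units. */
          long long V      = 1000;        /* weighted average vruntime */
          long long vslice = 3000;        /* one request, in virtual time */
          long long lag    = 4000;        /* woken entity's stored lag */

          /* Current is somewhere inside its request. */
          long long curr_deadline = V + 2000;

          /* Placement: start the woken entity left of V by its lag, with a
           * virtual deadline one request further out. */
          long long se_vruntime = V - lag;
          long long se_deadline = se_vruntime + vslice;

          printf("woken vd = %lld vs current vd = %lld -> %s\n",
                 se_deadline, curr_deadline,
                 se_deadline < curr_deadline ? "immediate preemption"
                                             : "current keeps running");
          return 0;
  }
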
One random data point:
|
|
|
|
echo NO_RUN_TO_PARITY > /debug/sched/features
|
|
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
|
|
|
|
3,723,554 context-switches ( +- 0.56% )
|
|
9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% )
|
|
|
|
echo RUN_TO_PARITY > /debug/sched/features
|
|
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
|
|
|
|
2,556,535 context-switches ( +- 0.51% )
|
|
9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% )
|
|
|
|
Suggested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net
|
|
---
|
|
kernel/sched/fair.c | 12 ++++++++++++
|
|
kernel/sched/features.h | 1 +
|
|
2 files changed, 13 insertions(+)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 7dd9abc63..1cdc95725 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -873,6 +873,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
|
|
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
|
|
curr = NULL;
|
|
|
|
+ /*
|
|
+ * Once selected, run a task until it either becomes non-eligible or
|
|
+ * until it gets a new slice. See the HACK in set_next_entity().
|
|
+ */
|
|
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
|
|
+ return curr;
|
|
+
|
|
while (node) {
|
|
struct sched_entity *se = __node_2_se(node);
|
|
|
|
@@ -5156,6 +5163,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
update_stats_wait_end_fair(cfs_rq, se);
|
|
__dequeue_entity(cfs_rq, se);
|
|
update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
+ /*
|
|
+ * HACK, stash a copy of deadline at the point of pick in vlag,
|
|
+ * which isn't used until dequeue.
|
|
+ */
|
|
+ se->vlag = se->deadline;
|
|
}
|
|
|
|
update_stats_curr_start(cfs_rq, se);
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index 54334ca5c..546d212ef 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -6,6 +6,7 @@
|
|
*/
|
|
SCHED_FEAT(PLACE_LAG, true)
|
|
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
|
|
+SCHED_FEAT(RUN_TO_PARITY, true)
|
|
|
|
/*
|
|
* Prefer to schedule the task we woke last (assuming it failed
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 1989d7c6cb34c8e293574249320ee716bb5b47b9 Mon Sep 17 00:00:00 2001
|
|
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
|
|
Date: Thu, 24 Aug 2023 13:33:42 +0530
|
|
Subject: [PATCH 12/28] sched/eevdf/doc: Modify the documented knob to
|
|
base_slice_ns as well
|
|
|
|
After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns'
|
|
sysctl to 'base_slice_ns':
|
|
|
|
e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
|
|
|
|
... but we forgot to rename it in the documentation. Do that now.
|
|
|
|
Fixes: e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
|
|
Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Link: https://lore.kernel.org/r/20230824080342.543396-1-sshegde@linux.vnet.ibm.com
|
|
---
|
|
Documentation/scheduler/sched-design-CFS.rst | 2 +-
|
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
|
|
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
|
|
index 03db55504..f68919800 100644
|
|
--- a/Documentation/scheduler/sched-design-CFS.rst
|
|
+++ b/Documentation/scheduler/sched-design-CFS.rst
|
|
@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
|
|
way the previous scheduler had, and has no heuristics whatsoever. There is
|
|
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
|
|
|
|
- /sys/kernel/debug/sched/min_granularity_ns
|
|
+ /sys/kernel/debug/sched/base_slice_ns
|
|
|
|
which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
|
|
"server" (i.e., good batching) workloads. It defaults to a setting suitable
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From eb9c5a4550dcb41730ede47d1554fcc634d3463a Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Fri, 15 Sep 2023 00:48:55 +0200
|
|
Subject: [PATCH 13/28] sched/eevdf: Also update slice on placement
|
|
|
|
Tasks that never consume their full slice would not update their slice value.
|
|
This means that tasks that are spawned before the sysctl scaling keep their
|
|
original (UP) slice length.
|
|
|
|
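For context on the sysctl scaling referred to above: with the default
SCHED_TUNABLESCALING_LOG policy the base slice is scaled by a factor of
1 + ilog2(ncpus) once the secondary CPUs are up, so an entity that
sampled its slice before that point keeps the 0.75ms UP value. A
standalone sketch of the scaling, with a local stand-in for the
kernel's ilog2():

  #include <stdio.h>

  /* Stand-in for the kernel's ilog2() for the small values used here. */
  static unsigned int ilog2_u(unsigned int x)
  {
          unsigned int r = 0;

          while (x >>= 1)
                  r++;
          return r;
  }

  int main(void)
  {
          const unsigned int normalized_base_slice = 750000;   /* 0.75 ms (UP) */

          for (unsigned int cpus = 1; cpus <= 64; cpus *= 2) {
                  /* SCHED_TUNABLESCALING_LOG: factor = 1 + ilog2(ncpus) */
                  unsigned int factor = 1 + ilog2_u(cpus);

                  printf("%2u CPUs: slice = %u ns\n",
                         cpus, factor * normalized_base_slice);
          }
          return 0;
  }
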
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Link: https://lkml.kernel.org/r/20230915124822.847197830@noisy.programming.kicks-ass.net
|
|
---
|
|
kernel/sched/fair.c | 6 ++++--
|
|
1 file changed, 4 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 1cdc95725..efbcdc69c 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -4918,10 +4918,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
|
|
static void
|
|
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
{
|
|
- u64 vslice = calc_delta_fair(se->slice, se);
|
|
- u64 vruntime = avg_vruntime(cfs_rq);
|
|
+ u64 vslice, vruntime = avg_vruntime(cfs_rq);
|
|
s64 lag = 0;
|
|
|
|
+ se->slice = sysctl_sched_base_slice;
|
|
+ vslice = calc_delta_fair(se->slice, se);
|
|
+
|
|
/*
|
|
* Due to how V is constructed as the weighted average of entities,
|
|
* adding tasks with positive lag, or removing tasks with negative lag
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 9bf86297a3305c99ef82aa77fef7b39c8cc763c1 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Tue, 26 Sep 2023 14:29:50 +0200
|
|
Subject: [PATCH 14/28] sched/eevdf: Fix avg_vruntime()
|
|
|
|
The expectation is that placing a task at avg_vruntime() makes it
|
|
eligible. Turns out there is a corner case where this is not the case.
|
|
|
|
Specifically, avg_vruntime() relies on the fact that integer division
is a flooring function (e.g. it discards the remainder). By this
property the value returned is slightly left of the true average.

However, when the average is negative (relative to min_vruntime) the
effect is flipped and the division becomes a ceiling, with the result
that the returned value is just right of the average and thus not
eligible.
|
|
|
|
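The corner case is easy to demonstrate in plain C: dividing a negative
sum truncates toward zero, i.e. rounds up instead of down, and the fix
below restores flooring by subtracting (load - 1) first. A standalone
sketch:

  #include <stdio.h>

  /* Floor division for a signed numerator, as the fix implements it. */
  static long long div_floor(long long sum, long long load)
  {
          if (sum < 0)
                  sum -= load - 1;        /* sign flips effective floor / ceil */
          return sum / load;
  }

  int main(void)
  {
          long long load = 3;
          long long sums[] = { 7, -7 };   /* weighted key sums around min_vruntime */

          for (int i = 0; i < 2; i++)
                  printf("sum = %3lld: truncating div = %3lld, floor div = %3lld\n",
                         sums[i], sums[i] / load, div_floor(sums[i], load));
          return 0;
  }
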
Fixes: af4cf40470c2 ("sched/fair: Add cfs_rq::avg_vruntime")
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
---
|
|
kernel/sched/fair.c | 10 +++++++++-
|
|
1 file changed, 9 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index efbcdc69c..9dbf3ce61 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -664,6 +664,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
|
|
cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
|
|
}
|
|
|
|
+/*
|
|
+ * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
|
|
+ * For this to be so, the result of this function must have a left bias.
|
|
+ */
|
|
u64 avg_vruntime(struct cfs_rq *cfs_rq)
|
|
{
|
|
struct sched_entity *curr = cfs_rq->curr;
|
|
@@ -677,8 +681,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
|
|
load += weight;
|
|
}
|
|
|
|
- if (load)
|
|
+ if (load) {
|
|
+ /* sign flips effective floor / ceil */
|
|
+ if (avg < 0)
|
|
+ avg -= (load - 1);
|
|
avg = div_s64(avg, load);
|
|
+ }
|
|
|
|
return cfs_rq->min_vruntime + avg;
|
|
}
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From a23808a0a84a235f0471ad3a2b9bb3398792bf44 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Fri, 6 Oct 2023 21:24:45 +0200
|
|
Subject: [PATCH 15/28] sched/eevdf: Fix min_deadline heap integrity
|
|
|
|
Marek and Biju reported instances of:
|
|
|
|
"EEVDF scheduling fail, picking leftmost"
|
|
|
|
which Mike correlated with cgroup scheduling and the min_deadline heap
|
|
getting corrupted; some trace output confirms:
|
|
|
|
> And yeah, min_deadline is hosed somehow:
|
|
>
|
|
> validate_cfs_rq: --- /
|
|
> __print_se: ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -66302085 E (11372/tr)
|
|
> __print_se: ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -66302085 E (-1//autogroup-31)
|
|
> __print_se: ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N (-1//autogroup-33)
|
|
> validate_cfs_rq: min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256
|
|
|
|
Turns out that reweight_entity(), which tries really hard to be fast,
|
|
does not do the normal dequeue+update+enqueue pattern but *does* scale
|
|
the deadline.
|
|
|
|
However, it then fails to propagate the updated deadline value up the
|
|
heap.
|
|
|
|
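The invariant that goes stale here, shown without the kernel's
augmented rbtree machinery: every node caches the minimum deadline of
its subtree, so editing a node's deadline in place requires refreshing
the cached minima of all its ancestors. A minimal standalone sketch of
that propagation, using parent pointers instead of the real rb_node
callbacks:

  #include <stdio.h>

  struct node {
          long long deadline;
          long long min_deadline;         /* cached min over this subtree */
          struct node *parent, *left, *right;
  };

  static long long subtree_min(struct node *n)
  {
          long long m = n->deadline;

          if (n->left && n->left->min_deadline < m)
                  m = n->left->min_deadline;
          if (n->right && n->right->min_deadline < m)
                  m = n->right->min_deadline;
          return m;
  }

  /* Conceptually what min_deadline_cb_propagate() does: refresh ancestors. */
  static void propagate(struct node *n)
  {
          for (; n; n = n->parent)
                  n->min_deadline = subtree_min(n);
  }

  int main(void)
  {
          struct node leaf = { .deadline = 10 };
          struct node root = { .deadline = 30, .left = &leaf };

          leaf.parent = &root;
          propagate(&leaf);               /* root.min_deadline == 10 */

          /* Scale the deadline in place, as reweight_entity() does ... */
          leaf.deadline = 40;
          /* ... without propagating: the cached value is now a lie. */
          printf("stale root min_deadline = %lld\n", root.min_deadline);

          propagate(&leaf);               /* the fix */
          printf("fixed root min_deadline = %lld\n", root.min_deadline);
          return 0;
  }
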
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
|
|
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
|
|
Reported-by: Biju Das <biju.das.jz@bp.renesas.com>
|
|
Reported-by: Mike Galbraith <efault@gmx.de>
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Link: https://lkml.kernel.org/r/20231006192445.GE743@noisy.programming.kicks-ass.net
|
|
---
|
|
kernel/sched/fair.c | 1 +
|
|
1 file changed, 1 insertion(+)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 9dbf3ce61..a0f1d9578 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -3612,6 +3612,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
*/
|
|
deadline = div_s64(deadline * old_weight, weight);
|
|
se->deadline = se->vruntime + deadline;
|
|
+ min_deadline_cb_propagate(&se->run_node, NULL);
|
|
}
|
|
|
|
#ifdef CONFIG_SMP
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 4e6428fd79ddf1f53e6171ead190c359551cdfda Mon Sep 17 00:00:00 2001
|
|
From: Benjamin Segall <bsegall@google.com>
|
|
Date: Fri, 29 Sep 2023 17:09:30 -0700
|
|
Subject: [PATCH 16/28] sched/eevdf: Fix pick_eevdf()
|
|
|
|
The old pick_eevdf() could fail to find the actual earliest eligible
deadline when it descended to the right looking for min_deadline, only
for that min_deadline to turn out not to be eligible. In that case we
need to go back and search through any left branches we skipped,
looking for the actual best _eligible_ min_deadline.
|
|
|
|
This is more expensive, but still O(log n), and at worst should only
|
|
involve descending two branches of the rbtree.
|
|
|
|
I've run this through a userspace stress test (thank you
|
|
tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
|
|
corner cases.
|
|
|
|
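The shape of the fixed search, modelled on a toy structure: the first
loop walks down under the eligibility constraint and remembers the best
fully-eligible left subtree it skipped; if that subtree wins, a second
loop descends it purely by min_deadline. The snippet below is a
standalone model of that second loop only, on hand-built nodes rather
than an rbtree:

  #include <stdio.h>

  struct node {
          long long deadline;
          long long min_deadline;         /* min over this subtree */
          struct node *left, *right;
  };

  /*
   * Phase two of the fixed pick: inside a subtree known to be entirely
   * eligible, walk towards the node whose deadline equals the subtree's
   * min_deadline by following whichever child still advertises it.
   */
  static struct node *pick_min_deadline(struct node *n)
  {
          while (n) {
                  if (n->deadline == n->min_deadline)
                          return n;
                  if (n->left && n->left->min_deadline == n->min_deadline)
                          n = n->left;
                  else
                          n = n->right;
          }
          return NULL;
  }

  int main(void)
  {
          /* Hand-built subtree, min_deadline filled in bottom-up. */
          struct node l    = { .deadline = 50, .min_deadline = 50 };
          struct node r    = { .deadline = 70, .min_deadline = 70 };
          struct node root = { .deadline = 60, .min_deadline = 50,
                               .left = &l, .right = &r };

          printf("picked deadline %lld\n", pick_min_deadline(&root)->deadline);
          return 0;
  }
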
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
|
|
Signed-off-by: Ben Segall <bsegall@google.com>
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Link: https://lkml.kernel.org/r/xm261qego72d.fsf_-_@google.com
|
|
---
|
|
kernel/sched/fair.c | 72 ++++++++++++++++++++++++++++++++++++---------
|
|
1 file changed, 58 insertions(+), 14 deletions(-)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index a0f1d9578..caec9b43c 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -872,14 +872,16 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
|
|
*
|
|
* Which allows an EDF like search on (sub)trees.
|
|
*/
|
|
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
|
|
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
|
|
{
|
|
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
|
|
struct sched_entity *curr = cfs_rq->curr;
|
|
struct sched_entity *best = NULL;
|
|
+ struct sched_entity *best_left = NULL;
|
|
|
|
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
|
|
curr = NULL;
|
|
+ best = curr;
|
|
|
|
/*
|
|
* Once selected, run a task until it either becomes non-eligible or
|
|
@@ -900,33 +902,75 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
|
|
}
|
|
|
|
/*
|
|
- * If this entity has an earlier deadline than the previous
|
|
- * best, take this one. If it also has the earliest deadline
|
|
- * of its subtree, we're done.
|
|
+ * Now we heap search eligible trees for the best (min_)deadline
|
|
*/
|
|
- if (!best || deadline_gt(deadline, best, se)) {
|
|
+ if (!best || deadline_gt(deadline, best, se))
|
|
best = se;
|
|
- if (best->deadline == best->min_deadline)
|
|
- break;
|
|
- }
|
|
|
|
/*
|
|
- * If the earlest deadline in this subtree is in the fully
|
|
- * eligible left half of our space, go there.
|
|
+ * Every se in a left branch is eligible, keep track of the
|
|
+ * branch with the best min_deadline
|
|
*/
|
|
+ if (node->rb_left) {
|
|
+ struct sched_entity *left = __node_2_se(node->rb_left);
|
|
+
|
|
+ if (!best_left || deadline_gt(min_deadline, best_left, left))
|
|
+ best_left = left;
|
|
+
|
|
+ /*
|
|
+ * min_deadline is in the left branch. rb_left and all
|
|
+ * descendants are eligible, so immediately switch to the second
|
|
+ * loop.
|
|
+ */
|
|
+ if (left->min_deadline == se->min_deadline)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* min_deadline is at this node, no need to look right */
|
|
+ if (se->deadline == se->min_deadline)
|
|
+ break;
|
|
+
|
|
+ /* else min_deadline is in the right branch. */
|
|
+ node = node->rb_right;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * We ran into an eligible node which is itself the best.
|
|
+ * (Or nr_running == 0 and both are NULL)
|
|
+ */
|
|
+ if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
|
|
+ return best;
|
|
+
|
|
+ /*
|
|
+ * Now best_left and all of its children are eligible, and we are just
|
|
+ * looking for deadline == min_deadline
|
|
+ */
|
|
+ node = &best_left->run_node;
|
|
+ while (node) {
|
|
+ struct sched_entity *se = __node_2_se(node);
|
|
+
|
|
+ /* min_deadline is the current node */
|
|
+ if (se->deadline == se->min_deadline)
|
|
+ return se;
|
|
+
|
|
+ /* min_deadline is in the left branch */
|
|
if (node->rb_left &&
|
|
__node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
|
|
node = node->rb_left;
|
|
continue;
|
|
}
|
|
|
|
+ /* else min_deadline is in the right branch */
|
|
node = node->rb_right;
|
|
}
|
|
+ return NULL;
|
|
+}
|
|
|
|
- if (!best || (curr && deadline_gt(deadline, best, curr)))
|
|
- best = curr;
|
|
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
|
|
+{
|
|
+ struct sched_entity *se = __pick_eevdf(cfs_rq);
|
|
|
|
- if (unlikely(!best)) {
|
|
+ if (!se) {
|
|
struct sched_entity *left = __pick_first_entity(cfs_rq);
|
|
if (left) {
|
|
pr_err("EEVDF scheduling fail, picking leftmost\n");
|
|
@@ -934,7 +978,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
|
|
}
|
|
}
|
|
|
|
- return best;
|
|
+ return se;
|
|
}
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From d872c56e45bbe2bb687568cf2f15f6819916b565 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Tue, 17 Oct 2023 16:59:47 +0200
|
|
Subject: [PATCH 17/28] sched/eevdf: Fix heap corruption more
|
|
|
|
Because someone is a flaming idiot... :/
|
|
|
|
Fixes: 8dafa9d0eb1a ("sched/eevdf: Fix min_deadline heap integrity")
|
|
Reported-by: 0599jiangyc@gmail.com
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218020
|
|
---
|
|
kernel/sched/fair.c | 3 ++-
|
|
1 file changed, 2 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index caec9b43c..d0d912960 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -3656,7 +3656,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
*/
|
|
deadline = div_s64(deadline * old_weight, weight);
|
|
se->deadline = se->vruntime + deadline;
|
|
- min_deadline_cb_propagate(&se->run_node, NULL);
|
|
+ if (se != cfs_rq->curr)
|
|
+ min_deadline_cb_propagate(&se->run_node, NULL);
|
|
}
|
|
|
|
#ifdef CONFIG_SMP
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From b33c87a2d228bfc3f7950a681268b615757efd8a Mon Sep 17 00:00:00 2001
|
|
From: Ingo Molnar <mingo@kernel.org>
|
|
Date: Tue, 19 Sep 2023 10:31:15 +0200
|
|
Subject: [PATCH 18/28] sched/fair: Rename check_preempt_wakeup() to
|
|
check_preempt_wakeup_fair()
|
|
|
|
Other scheduling classes already postfix their similar methods
|
|
with the class name.
|
|
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
---
|
|
kernel/sched/fair.c | 4 ++--
|
|
1 file changed, 2 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index d0d912960..89774e7e2 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -7966,7 +7966,7 @@ static void set_next_buddy(struct sched_entity *se)
|
|
/*
|
|
* Preempt the current task with a newly woken task if needed:
|
|
*/
|
|
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
|
|
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
|
|
{
|
|
struct task_struct *curr = rq->curr;
|
|
struct sched_entity *se = &curr->se, *pse = &p->se;
|
|
@@ -12680,7 +12680,7 @@ DEFINE_SCHED_CLASS(fair) = {
|
|
.yield_task = yield_task_fair,
|
|
.yield_to_task = yield_to_task_fair,
|
|
|
|
- .check_preempt_curr = check_preempt_wakeup,
|
|
+ .check_preempt_curr = check_preempt_wakeup_fair,
|
|
|
|
.pick_next_task = __pick_next_task_fair,
|
|
.put_prev_task = put_prev_task_fair,
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 2c79e453add0658cd6750eb81fbc241816c31b84 Mon Sep 17 00:00:00 2001
|
|
From: Ingo Molnar <mingo@kernel.org>
|
|
Date: Tue, 19 Sep 2023 10:38:21 +0200
|
|
Subject: [PATCH 19/28] sched/fair: Rename check_preempt_curr() to
|
|
wakeup_preempt()
|
|
|
|
The name is a bit opaque - make it clear that this is about wakeup
|
|
preemption.
|
|
|
|
Also rename the ->check_preempt_curr() methods similarly.
|
|
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
---
|
|
kernel/sched/core.c | 14 +++++++-------
|
|
kernel/sched/deadline.c | 10 +++++-----
|
|
kernel/sched/fair.c | 10 +++++-----
|
|
kernel/sched/idle.c | 4 ++--
|
|
kernel/sched/rt.c | 6 +++---
|
|
kernel/sched/sched.h | 4 ++--
|
|
kernel/sched/stop_task.c | 4 ++--
|
|
7 files changed, 26 insertions(+), 26 deletions(-)
|
|
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index be77d999d..4d851de8e 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -2198,10 +2198,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
|
|
p->sched_class->prio_changed(rq, p, oldprio);
|
|
}
|
|
|
|
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
|
|
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
|
|
{
|
|
if (p->sched_class == rq->curr->sched_class)
|
|
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
|
|
+ rq->curr->sched_class->wakeup_preempt(rq, p, flags);
|
|
else if (sched_class_above(p->sched_class, rq->curr->sched_class))
|
|
resched_curr(rq);
|
|
|
|
@@ -2507,7 +2507,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
|
|
rq_lock(rq, rf);
|
|
WARN_ON_ONCE(task_cpu(p) != new_cpu);
|
|
activate_task(rq, p, 0);
|
|
- check_preempt_curr(rq, p, 0);
|
|
+ wakeup_preempt(rq, p, 0);
|
|
|
|
return rq;
|
|
}
|
|
@@ -3389,7 +3389,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
|
|
deactivate_task(src_rq, p, 0);
|
|
set_task_cpu(p, cpu);
|
|
activate_task(dst_rq, p, 0);
|
|
- check_preempt_curr(dst_rq, p, 0);
|
|
+ wakeup_preempt(dst_rq, p, 0);
|
|
|
|
rq_unpin_lock(dst_rq, &drf);
|
|
rq_unpin_lock(src_rq, &srf);
|
|
@@ -3774,7 +3774,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
|
|
}
|
|
|
|
activate_task(rq, p, en_flags);
|
|
- check_preempt_curr(rq, p, wake_flags);
|
|
+ wakeup_preempt(rq, p, wake_flags);
|
|
|
|
ttwu_do_wakeup(p);
|
|
|
|
@@ -3845,7 +3845,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
|
|
* it should preempt the task that is current now.
|
|
*/
|
|
update_rq_clock(rq);
|
|
- check_preempt_curr(rq, p, wake_flags);
|
|
+ wakeup_preempt(rq, p, wake_flags);
|
|
}
|
|
ttwu_do_wakeup(p);
|
|
ret = 1;
|
|
@@ -4872,7 +4872,7 @@ void wake_up_new_task(struct task_struct *p)
|
|
|
|
activate_task(rq, p, ENQUEUE_NOCLOCK);
|
|
trace_sched_wakeup_new(p);
|
|
- check_preempt_curr(rq, p, WF_FORK);
|
|
+ wakeup_preempt(rq, p, WF_FORK);
|
|
#ifdef CONFIG_SMP
|
|
if (p->sched_class->task_woken) {
|
|
/*
|
|
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
|
|
index 58b542bf2..fb1996a67 100644
|
|
--- a/kernel/sched/deadline.c
|
|
+++ b/kernel/sched/deadline.c
|
|
@@ -763,7 +763,7 @@ static inline void deadline_queue_pull_task(struct rq *rq)
|
|
|
|
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
|
|
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
|
|
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
|
|
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
|
|
|
|
static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
|
|
struct rq *rq)
|
|
@@ -1175,7 +1175,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
|
|
|
|
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
|
|
if (dl_task(rq->curr))
|
|
- check_preempt_curr_dl(rq, p, 0);
|
|
+ wakeup_preempt_dl(rq, p, 0);
|
|
else
|
|
resched_curr(rq);
|
|
|
|
@@ -1939,7 +1939,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
|
|
* Only called when both the current and waking task are -deadline
|
|
* tasks.
|
|
*/
|
|
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
|
|
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
|
|
int flags)
|
|
{
|
|
if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
|
|
@@ -2652,7 +2652,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
|
|
deadline_queue_push_tasks(rq);
|
|
#endif
|
|
if (dl_task(rq->curr))
|
|
- check_preempt_curr_dl(rq, p, 0);
|
|
+ wakeup_preempt_dl(rq, p, 0);
|
|
else
|
|
resched_curr(rq);
|
|
} else {
|
|
@@ -2721,7 +2721,7 @@ DEFINE_SCHED_CLASS(dl) = {
|
|
.dequeue_task = dequeue_task_dl,
|
|
.yield_task = yield_task_dl,
|
|
|
|
- .check_preempt_curr = check_preempt_curr_dl,
|
|
+ .wakeup_preempt = wakeup_preempt_dl,
|
|
|
|
.pick_next_task = pick_next_task_dl,
|
|
.put_prev_task = put_prev_task_dl,
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 89774e7e2..ab95f1312 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -7979,7 +7979,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
|
|
|
|
/*
|
|
* This is possible from callers such as attach_tasks(), in which we
|
|
- * unconditionally check_preempt_curr() after an enqueue (which may have
|
|
+ * unconditionally wakeup_preempt() after an enqueue (which may have
|
|
* lead to a throttle). This both saves work and prevents false
|
|
* next-buddy nomination below.
|
|
*/
|
|
@@ -8880,7 +8880,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
|
|
|
|
WARN_ON_ONCE(task_rq(p) != rq);
|
|
activate_task(rq, p, ENQUEUE_NOCLOCK);
|
|
- check_preempt_curr(rq, p, 0);
|
|
+ wakeup_preempt(rq, p, 0);
|
|
}
|
|
|
|
/*
|
|
@@ -12219,7 +12219,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
|
|
if (p->prio > oldprio)
|
|
resched_curr(rq);
|
|
} else
|
|
- check_preempt_curr(rq, p, 0);
|
|
+ wakeup_preempt(rq, p, 0);
|
|
}
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
@@ -12321,7 +12321,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
|
|
if (task_current(rq, p))
|
|
resched_curr(rq);
|
|
else
|
|
- check_preempt_curr(rq, p, 0);
|
|
+ wakeup_preempt(rq, p, 0);
|
|
}
|
|
}
|
|
|
|
@@ -12680,7 +12680,7 @@ DEFINE_SCHED_CLASS(fair) = {
|
|
.yield_task = yield_task_fair,
|
|
.yield_to_task = yield_to_task_fair,
|
|
|
|
- .check_preempt_curr = check_preempt_wakeup_fair,
|
|
+ .wakeup_preempt = check_preempt_wakeup_fair,
|
|
|
|
.pick_next_task = __pick_next_task_fair,
|
|
.put_prev_task = put_prev_task_fair,
|
|
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
|
|
index 5007b25c5..565f8374d 100644
|
|
--- a/kernel/sched/idle.c
|
|
+++ b/kernel/sched/idle.c
|
|
@@ -401,7 +401,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|
/*
|
|
* Idle tasks are unconditionally rescheduled:
|
|
*/
|
|
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
|
|
+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
|
|
{
|
|
resched_curr(rq);
|
|
}
|
|
@@ -482,7 +482,7 @@ DEFINE_SCHED_CLASS(idle) = {
|
|
/* dequeue is not valid, we print a debug message there: */
|
|
.dequeue_task = dequeue_task_idle,
|
|
|
|
- .check_preempt_curr = check_preempt_curr_idle,
|
|
+ .wakeup_preempt = wakeup_preempt_idle,
|
|
|
|
.pick_next_task = pick_next_task_idle,
|
|
.put_prev_task = put_prev_task_idle,
|
|
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
|
|
index 185d3d749..6f066d5e0 100644
|
|
--- a/kernel/sched/rt.c
|
|
+++ b/kernel/sched/rt.c
|
|
@@ -953,7 +953,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
|
|
|
|
/*
|
|
* When we're idle and a woken (rt) task is
|
|
- * throttled check_preempt_curr() will set
|
|
+ * throttled wakeup_preempt() will set
|
|
* skip_update and the time between the wakeup
|
|
* and this unthrottle will get accounted as
|
|
* 'runtime'.
|
|
@@ -1715,7 +1715,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
|
|
/*
|
|
* Preempt the current task with a newly woken task if needed:
|
|
*/
|
|
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
|
|
+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
|
|
{
|
|
if (p->prio < rq->curr->prio) {
|
|
resched_curr(rq);
|
|
@@ -2702,7 +2702,7 @@ DEFINE_SCHED_CLASS(rt) = {
|
|
.dequeue_task = dequeue_task_rt,
|
|
.yield_task = yield_task_rt,
|
|
|
|
- .check_preempt_curr = check_preempt_curr_rt,
|
|
+ .wakeup_preempt = wakeup_preempt_rt,
|
|
|
|
.pick_next_task = pick_next_task_rt,
|
|
.put_prev_task = put_prev_task_rt,
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index 576d371c8..26c235d3a 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -2217,7 +2217,7 @@ struct sched_class {
|
|
void (*yield_task) (struct rq *rq);
|
|
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
|
|
|
|
- void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
|
|
+ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
|
|
|
|
struct task_struct *(*pick_next_task)(struct rq *rq);
|
|
|
|
@@ -2490,7 +2490,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
|
|
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
|
|
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
|
|
|
|
-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
|
|
+extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
|
|
|
|
#ifdef CONFIG_PREEMPT_RT
|
|
#define SCHED_NR_MIGRATE_BREAK 8
|
|
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
|
|
index 85590599b..6cf7304e6 100644
|
|
--- a/kernel/sched/stop_task.c
|
|
+++ b/kernel/sched/stop_task.c
|
|
@@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|
#endif /* CONFIG_SMP */
|
|
|
|
static void
|
|
-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
|
|
+wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
|
|
{
|
|
/* we're never preempted */
|
|
}
|
|
@@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = {
|
|
.dequeue_task = dequeue_task_stop,
|
|
.yield_task = yield_task_stop,
|
|
|
|
- .check_preempt_curr = check_preempt_curr_stop,
|
|
+ .wakeup_preempt = wakeup_preempt_stop,
|
|
|
|
.pick_next_task = pick_next_task_stop,
|
|
.put_prev_task = put_prev_task_stop,
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From a4c874c05e95ba4e22151f5ac13074b8c1d861e8 Mon Sep 17 00:00:00 2001
|
|
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
|
|
Date: Wed, 20 Sep 2023 15:00:24 +0200
|
|
Subject: [PATCH 20/28] sched/debug: Remove the
|
|
/proc/sys/kernel/sched_child_runs_first sysctl
|
|
|
|
The /proc/sys/kernel/sched_child_runs_first knob is no longer connected since:
|
|
|
|
5e963f2bd4654 ("sched/fair: Commit to EEVDF")
|
|
|
|
Remove it.
|
|
|
|
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
Link: https://lore.kernel.org/r/20230920130025.412071-2-bigeasy@linutronix.de
|
|
---
|
|
kernel/sched/debug.c | 1 -
|
|
kernel/sched/fair.c | 13 -------------
|
|
kernel/sched/sched.h | 2 --
|
|
3 files changed, 16 deletions(-)
|
|
|
|
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
|
|
index 2c5bb64f5..003fe3fb4 100644
|
|
--- a/kernel/sched/debug.c
|
|
+++ b/kernel/sched/debug.c
|
|
@@ -863,7 +863,6 @@ static void sched_debug_header(struct seq_file *m)
|
|
#define PN(x) \
|
|
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
|
|
PN(sysctl_sched_base_slice);
|
|
- P(sysctl_sched_child_runs_first);
|
|
P(sysctl_sched_features);
|
|
#undef PN
|
|
#undef P
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index ab95f1312..23d769b9c 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -78,12 +78,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
|
|
unsigned int sysctl_sched_base_slice = 750000ULL;
|
|
static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
|
|
|
|
-/*
|
|
- * After fork, child runs first. If set to 0 (default) then
|
|
- * parent will (try to) run first.
|
|
- */
|
|
-unsigned int sysctl_sched_child_runs_first __read_mostly;
|
|
-
|
|
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
|
|
|
|
int sched_thermal_decay_shift;
|
|
@@ -145,13 +139,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
static struct ctl_table sched_fair_sysctls[] = {
|
|
- {
|
|
- .procname = "sched_child_runs_first",
|
|
- .data = &sysctl_sched_child_runs_first,
|
|
- .maxlen = sizeof(unsigned int),
|
|
- .mode = 0644,
|
|
- .proc_handler = proc_dointvec,
|
|
- },
|
|
#ifdef CONFIG_CFS_BANDWIDTH
|
|
{
|
|
.procname = "sched_cfs_bandwidth_slice_us",
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index 26c235d3a..ab53f7eca 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -109,8 +109,6 @@ extern __read_mostly int scheduler_running;
|
|
extern unsigned long calc_load_update;
|
|
extern atomic_long_t calc_load_tasks;
|
|
|
|
-extern unsigned int sysctl_sched_child_runs_first;
|
|
-
|
|
extern void calc_global_load_tick(struct rq *this_rq);
|
|
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
|
|
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 01a0126a1b54a0984126857645838cb03adac52e Mon Sep 17 00:00:00 2001
|
|
From: Yiwei Lin <s921975628@gmail.com>
|
|
Date: Fri, 20 Oct 2023 13:56:17 +0800
|
|
Subject: [PATCH 21/28] sched/fair: Remove unused 'curr' argument from
|
|
pick_next_entity()
|
|
|
|
The 'curr' argument of pick_next_entity() has become unused after
|
|
the EEVDF changes.
|
|
|
|
[ mingo: Updated the changelog. ]
|
|
|
|
Signed-off-by: Yiwei Lin <s921975628@gmail.com>
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
Link: https://lore.kernel.org/r/20231020055617.42064-1-s921975628@gmail.com
|
|
---
|
|
kernel/sched/fair.c | 8 ++++----
|
|
1 file changed, 4 insertions(+), 4 deletions(-)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 23d769b9c..fe6c762d5 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -5242,7 +5242,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
* 4) do not run the "skip" process, if something else is available
|
|
*/
|
|
static struct sched_entity *
|
|
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
+pick_next_entity(struct cfs_rq *cfs_rq)
|
|
{
|
|
/*
|
|
* Enabling NEXT_BUDDY will affect latency but not fairness.
|
|
@@ -8058,7 +8058,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
|
|
goto again;
|
|
}
|
|
|
|
- se = pick_next_entity(cfs_rq, curr);
|
|
+ se = pick_next_entity(cfs_rq);
|
|
cfs_rq = group_cfs_rq(se);
|
|
} while (cfs_rq);
|
|
|
|
@@ -8121,7 +8121,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
|
|
}
|
|
}
|
|
|
|
- se = pick_next_entity(cfs_rq, curr);
|
|
+ se = pick_next_entity(cfs_rq);
|
|
cfs_rq = group_cfs_rq(se);
|
|
} while (cfs_rq);
|
|
|
|
@@ -8160,7 +8160,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
|
|
put_prev_task(rq, prev);
|
|
|
|
do {
|
|
- se = pick_next_entity(cfs_rq, NULL);
|
|
+ se = pick_next_entity(cfs_rq);
|
|
set_next_entity(cfs_rq, se);
|
|
cfs_rq = group_cfs_rq(se);
|
|
} while (cfs_rq);
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 0ba49853099d2af9d4bb8e2c945f8dc6daa81216 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Sat, 14 Oct 2023 23:12:20 +0200
|
|
Subject: [PATCH 22/28] sched/eevdf: Add feature comments
|
|
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
---
|
|
kernel/sched/features.h | 7 +++++++
|
|
1 file changed, 7 insertions(+)
|
|
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index 546d212ef..46b65fdc6 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -5,7 +5,14 @@
|
|
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
|
|
*/
|
|
SCHED_FEAT(PLACE_LAG, true)
|
|
+/*
|
|
+ * Give new tasks half a slice to ease into the competition.
|
|
+ */
|
|
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
|
|
+/*
|
|
+ * Inhibit (wakeup) preemption until the current task has either matched the
|
|
+ * 0-lag point or until it has exhausted its slice.
|
|
+ */
|
|
SCHED_FEAT(RUN_TO_PARITY, true)
|
|
|
|
/*
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 8db4bc9bec401af5bf78f071471728dac9420171 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Wed, 4 Oct 2023 12:43:53 +0200
|
|
Subject: [PATCH 23/28] sched/eevdf: Remove min_vruntime_copy
|
|
|
|
Since commit e8f331bcc270 ("sched/smp: Use lag to simplify
|
|
cross-runqueue placement") the min_vruntime_copy is no longer used.
|
|
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
---
|
|
kernel/sched/fair.c | 5 ++---
|
|
kernel/sched/sched.h | 4 ----
|
|
2 files changed, 2 insertions(+), 7 deletions(-)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index fe6c762d5..fbf907804 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -774,8 +774,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
|
}
|
|
|
|
/* ensure we never gain time by being placed backwards. */
|
|
- u64_u32_store(cfs_rq->min_vruntime,
|
|
- __update_min_vruntime(cfs_rq, vruntime));
|
|
+ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
|
|
}
|
|
|
|
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
|
|
@@ -12343,7 +12342,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
|
|
void init_cfs_rq(struct cfs_rq *cfs_rq)
|
|
{
|
|
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
|
|
- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
|
|
+ cfs_rq->min_vruntime = (u64)(-(1LL << 20));
|
|
#ifdef CONFIG_SMP
|
|
raw_spin_lock_init(&cfs_rq->removed.lock);
|
|
#endif
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index ab53f7eca..0a1957994 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -556,10 +556,6 @@ struct cfs_rq {
|
|
u64 min_vruntime_fi;
|
|
#endif
|
|
|
|
-#ifndef CONFIG_64BIT
|
|
- u64 min_vruntime_copy;
|
|
-#endif
|
|
-
|
|
struct rb_root_cached tasks_timeline;
|
|
|
|
/*
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From b46653f0195e9b8f467671caa47973a9bf518ee6 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Mon, 22 May 2023 13:46:30 +0200
|
|
Subject: [PATCH 24/28] sched/eevdf: Use sched_attr::sched_runtime to set
|
|
request/slice suggestion
|
|
|
|
Allow applications to directly set a suggested request/slice length using
|
|
sched_attr::sched_runtime.
|
|
|
|
The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
|
|
which is 1/10 of the tick length at HZ=1000 and 10 times the tick length at HZ=100.
|
|
|
|
Applications should strive to use a high-confidence (95%+) estimate of their
periodic runtime as the target slice. Using a smaller slice
|
|
will introduce undue preemptions, while using a larger value will
|
|
increase latency.
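
As a concrete illustration (not part of this patch set), a minimal userspace
sketch of how a task might request a ~3ms slice once this is in place. It uses
the raw sched_setattr(2) syscall convention and declares struct sched_attr
locally, as the man page example does, since older glibc ships no wrapper; the
values are made up for the example:

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Local copy of the UAPI layout, as in the sched_setattr(2) man page. */
  struct sched_attr {
          uint32_t size;
          uint32_t sched_policy;
          uint64_t sched_flags;
          int32_t  sched_nice;
          uint32_t sched_priority;
          uint64_t sched_runtime;         /* request/slice hint, in ns */
          uint64_t sched_deadline;
          uint64_t sched_period;
  };

  int main(void)
  {
          struct sched_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.size          = sizeof(attr);
          attr.sched_policy  = 0;                /* SCHED_OTHER: stay in the fair class */
          attr.sched_runtime = 3 * 1000 * 1000;  /* ~3ms; the kernel clamps to [0.1ms, 100ms] */

          if (syscall(SYS_sched_setattr, 0 /* self */, &attr, 0))
                  perror("sched_setattr");
          return 0;
  }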
|
|
|
|
For all the following examples assume a scheduling quantum of 8, and for
|
|
consistency all examples have W=4:
|
|
|
|
{A,B,C,D}(w=1,r=8):
|
|
|
|
ABCD...
|
|
+---+---+---+---
|
|
|
|
t=0, V=1.5 t=1, V=3.5
|
|
A |------< A |------<
|
|
B |------< B |------<
|
|
C |------< C |------<
|
|
D |------< D |------<
|
|
---+*------+-------+--- ---+--*----+-------+---
|
|
|
|
t=2, V=5.5 t=3, V=7.5
|
|
A |------< A |------<
|
|
B |------< B |------<
|
|
C |------< C |------<
|
|
D |------< D |------<
|
|
---+----*--+-------+--- ---+------*+-------+---
|
|
|
|
Note: 4 identical tasks in FIFO order
|
|
|
|
~~~
|
|
|
|
{A,B}(w=1,r=16) C(w=2,r=16)
|
|
|
|
AACCBBCC...
|
|
+---+---+---+---
|
|
|
|
t=0, V=1.25 t=2, V=5.25
|
|
A |--------------< A |--------------<
|
|
B |--------------< B |--------------<
|
|
C |------< C |------<
|
|
---+*------+-------+--- ---+----*--+-------+---
|
|
|
|
t=4, V=8.25 t=6, V=12.25
|
|
A |--------------< A |--------------<
|
|
B |--------------< B |--------------<
|
|
C |------< C |------<
|
|
---+-------*-------+--- ---+-------+---*---+---
|
|
|
|
Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2
|
|
task doesn't go below q.
|
|
|
|
Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length.
|
|
|
|
Note: the period of the heavy task is half the full period at:
|
|
W*(r_i/w_i) = 4*(2q/2) = 4q
|
|
|
|
~~~
|
|
|
|
{A,C,D}(w=1,r=16) B(w=1,r=8):
|
|
|
|
BAACCBDD...
|
|
+---+---+---+---
|
|
|
|
t=0, V=1.5 t=1, V=3.5
|
|
A |--------------< A |---------------<
|
|
B |------< B |------<
|
|
C |--------------< C |--------------<
|
|
D |--------------< D |--------------<
|
|
---+*------+-------+--- ---+--*----+-------+---
|
|
|
|
t=3, V=7.5 t=5, V=11.5
|
|
A |---------------< A |---------------<
|
|
B |------< B |------<
|
|
C |--------------< C |--------------<
|
|
D |--------------< D |--------------<
|
|
---+------*+-------+--- ---+-------+--*----+---
|
|
|
|
t=6, V=13.5
|
|
A |---------------<
|
|
B |------<
|
|
C |--------------<
|
|
D |--------------<
|
|
---+-------+----*--+---
|
|
|
|
Note: 1 short task -- again double r so that the deadline of the short task
|
|
won't be below q. Made B short because it's not the leftmost task, but is
|
|
eligible with the 0,1,2,3 spread.
|
|
|
|
Note: like with the heavy task, the period of the short task observes:
|
|
W*(r_i/w_i) = 4*(1q/1) = 4q
|
|
|
|
~~~
|
|
|
|
A(w=1,r=16) B(w=1,r=8) C(w=2,r=16)
|
|
|
|
BCCAABCC...
|
|
+---+---+---+---
|
|
|
|
t=0, V=1.25 t=1, V=3.25
|
|
A |--------------< A |--------------<
|
|
B |------< B |------<
|
|
C |------< C |------<
|
|
---+*------+-------+--- ---+--*----+-------+---
|
|
|
|
t=3, V=7.25 t=5, V=11.25
|
|
A |--------------< A |--------------<
|
|
B |------< B |------<
|
|
C |------< C |------<
|
|
---+------*+-------+--- ---+-------+--*----+---
|
|
|
|
t=6, V=13.25
|
|
A |--------------<
|
|
B |------<
|
|
C |------<
|
|
---+-------+----*--+---
|
|
|
|
Note: 1 heavy and 1 short task -- combine them all.
|
|
|
|
Note: both the short and heavy task end up with a period of 4q
|
|
|
|
~~~
|
|
|
|
A(w=1,r=16) B(w=2,r=16) C(w=1,r=8)
|
|
|
|
BBCAABBC...
|
|
+---+---+---+---
|
|
|
|
t=0, V=1 t=2, V=5
|
|
A |--------------< A |--------------<
|
|
B |------< B |------<
|
|
C |------< C |------<
|
|
---+*------+-------+--- ---+----*--+-------+---
|
|
|
|
t=3, V=7 t=5, V=11
|
|
A |--------------< A |--------------<
|
|
B |------< B |------<
|
|
C |------< C |------<
|
|
---+------*+-------+--- ---+-------+--*----+---
|
|
|
|
t=7, V=15
|
|
A |--------------<
|
|
B |------<
|
|
C |------<
|
|
---+-------+------*+---
|
|
|
|
Note: as before but permuted
|
|
|
|
~~~
|
|
|
|
From all this it can be deduced that, for the steady state:
|
|
|
|
- the total period (P) of a schedule is: W*max(r_i/w_i)
|
|
- the average period of a task is: W*(r_i/w_i)
|
|
- each task obtains the fair share: w_i/W of each full period P
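
As a quick check of these expressions, plug in the heavy-task example above
(q=8, W=4, {A,B}(w=1,r=16) C(w=2,r=16)); the numbers are the document's own:

  P           = W*max(r_i/w_i) = 4*(16/1) = 64 = 8q
  period of C = W*(r_C/w_C)    = 4*(16/2) = 32 = 4q
  share of C  = w_C/W          = 2/4, i.e. C owns 4 of the 8 quanta in AACCBBCC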
|
|
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
---
|
|
include/linux/sched.h | 3 +++
|
|
kernel/sched/core.c | 33 ++++++++++++++++++++++++++-------
|
|
kernel/sched/debug.c | 3 ++-
|
|
kernel/sched/fair.c | 6 ++++--
|
|
4 files changed, 35 insertions(+), 10 deletions(-)
|
|
|
|
diff --git a/include/linux/sched.h b/include/linux/sched.h
|
|
index 35331c35f..e0a81ce05 100644
|
|
--- a/include/linux/sched.h
|
|
+++ b/include/linux/sched.h
|
|
@@ -555,6 +555,9 @@ struct sched_entity {
|
|
struct list_head group_node;
|
|
unsigned int on_rq;
|
|
|
|
+ unsigned int custom_slice : 1;
|
|
+ /* 31 bits hole */
|
|
+
|
|
u64 exec_start;
|
|
u64 sum_exec_runtime;
|
|
u64 prev_sum_exec_runtime;
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index 4d851de8e..6fcccd15a 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -4502,7 +4502,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|
p->se.nr_migrations = 0;
|
|
p->se.vruntime = 0;
|
|
p->se.vlag = 0;
|
|
- p->se.slice = sysctl_sched_base_slice;
|
|
INIT_LIST_HEAD(&p->se.group_node);
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
@@ -4756,6 +4755,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|
|
|
p->prio = p->normal_prio = p->static_prio;
|
|
set_load_weight(p, false);
|
|
+ p->se.custom_slice = 0;
|
|
+ p->se.slice = sysctl_sched_base_slice;
|
|
|
|
/*
|
|
* We don't need the reset flag anymore after the fork. It has
|
|
@@ -7527,10 +7528,20 @@ static void __setscheduler_params(struct task_struct *p,
|
|
|
|
p->policy = policy;
|
|
|
|
- if (dl_policy(policy))
|
|
+ if (dl_policy(policy)) {
|
|
__setparam_dl(p, attr);
|
|
- else if (fair_policy(policy))
|
|
+ } else if (fair_policy(policy)) {
|
|
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
|
|
+ if (attr->sched_runtime) {
|
|
+ p->se.custom_slice = 1;
|
|
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
|
|
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
|
|
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
|
|
+ } else {
|
|
+ p->se.custom_slice = 0;
|
|
+ p->se.slice = sysctl_sched_base_slice;
|
|
+ }
|
|
+ }
|
|
|
|
/*
|
|
* __sched_setscheduler() ensures attr->sched_priority == 0 when
|
|
@@ -7715,7 +7726,9 @@ static int __sched_setscheduler(struct task_struct *p,
|
|
* but store a possible modification of reset_on_fork.
|
|
*/
|
|
if (unlikely(policy == p->policy)) {
|
|
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
|
|
+ if (fair_policy(policy) &&
|
|
+ (attr->sched_nice != task_nice(p) ||
|
|
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
|
|
goto change;
|
|
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
|
|
goto change;
|
|
@@ -7861,6 +7874,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
|
|
.sched_nice = PRIO_TO_NICE(p->static_prio),
|
|
};
|
|
|
|
+ if (p->se.custom_slice)
|
|
+ attr.sched_runtime = p->se.slice;
|
|
+
|
|
/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
|
|
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
|
|
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
|
|
@@ -8037,12 +8053,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
|
|
|
|
static void get_params(struct task_struct *p, struct sched_attr *attr)
|
|
{
|
|
- if (task_has_dl_policy(p))
|
|
+ if (task_has_dl_policy(p)) {
|
|
__getparam_dl(p, attr);
|
|
- else if (task_has_rt_policy(p))
|
|
+ } else if (task_has_rt_policy(p)) {
|
|
attr->sched_priority = p->rt_priority;
|
|
- else
|
|
+ } else {
|
|
attr->sched_nice = task_nice(p);
|
|
+ attr->sched_runtime = p->se.slice;
|
|
+ }
|
|
}
|
|
|
|
/**
|
|
@@ -10061,6 +10079,7 @@ void __init sched_init(void)
|
|
}
|
|
|
|
set_load_weight(&init_task, false);
|
|
+ init_task.se.slice = sysctl_sched_base_slice;
|
|
|
|
/*
|
|
* The boot idle thread does lazy MMU switching as well:
|
|
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
|
|
index 003fe3fb4..37ffe69a9 100644
|
|
--- a/kernel/sched/debug.c
|
|
+++ b/kernel/sched/debug.c
|
|
@@ -578,11 +578,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
|
|
else
|
|
SEQ_printf(m, " %c", task_state_to_char(p));
|
|
|
|
- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
|
|
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
|
|
p->comm, task_pid_nr(p),
|
|
SPLIT_NS(p->se.vruntime),
|
|
entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
|
|
SPLIT_NS(p->se.deadline),
|
|
+ p->se.custom_slice ? 'S' : ' ',
|
|
SPLIT_NS(p->se.slice),
|
|
SPLIT_NS(p->se.sum_exec_runtime),
|
|
(long long)(p->nvcsw + p->nivcsw),
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index fbf907804..357005f0d 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -1011,7 +1011,8 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
* nice) while the request time r_i is determined by
|
|
* sysctl_sched_base_slice.
|
|
*/
|
|
- se->slice = sysctl_sched_base_slice;
|
|
+ if (!se->custom_slice)
|
|
+ se->slice = sysctl_sched_base_slice;
|
|
|
|
/*
|
|
* EEVDF: vd_i = ve_i + r_i / w_i
|
|
@@ -4961,7 +4962,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
u64 vslice, vruntime = avg_vruntime(cfs_rq);
|
|
s64 lag = 0;
|
|
|
|
- se->slice = sysctl_sched_base_slice;
|
|
+ if (!se->custom_slice)
|
|
+ se->slice = sysctl_sched_base_slice;
|
|
vslice = calc_delta_fair(se->slice, se);
|
|
|
|
/*
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 03a5b5f7a486c044e165ddec3253ebdbc0039031 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Tue, 26 Sep 2023 14:32:32 +0200
|
|
Subject: [PATCH 25/28] sched/eevdf: Allow shorter slices to wakeup-preempt
|
|
|
|
Part of the reason to have shorter slices is to improve
|
|
responsiveness. Allow shorter slices to preempt longer slices on
|
|
wakeup.
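
Spelled out, the check added below reads as follows (a condensed restatement
for readability only, not the patch itself; 'se' is the current entity, 'pse'
the entity that just woke up):

  if (sched_feat(PREEMPT_SHORT) &&                 /* feature enabled */
      pse->slice < se->slice &&                    /* wakee requested a shorter slice */
      entity_eligible(cfs_rq, pse) &&              /* wakee has non-negative lag */
      (s64)(pse->deadline - se->deadline) < 0 &&   /* and the earlier virtual deadline */
      se->vlag == se->deadline)                    /* current still slice-protected */
          se->vlag = se->deadline - 1;             /* drop the RUN_TO_PARITY protection */

so that the subsequent pick_eevdf() is free to select the waking task.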
|
|
|
|
Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
|
|
|
|
100ms massive_intr 500us cyclictest NO_PREEMPT_SHORT
|
|
|
|
1 massive_intr:(5) | 846018.956 ms | 779188 | avg: 0.273 ms | max: 58.337 ms | sum:212545.245 ms |
|
|
2 massive_intr:(5) | 853450.693 ms | 792269 | avg: 0.275 ms | max: 71.193 ms | sum:218263.588 ms |
|
|
3 massive_intr:(5) | 843888.920 ms | 771456 | avg: 0.277 ms | max: 92.405 ms | sum:213353.221 ms |
|
|
1 chromium-browse:(8) | 53015.889 ms | 131766 | avg: 0.463 ms | max: 36.341 ms | sum:60959.230 ms |
|
|
2 chromium-browse:(8) | 53864.088 ms | 136962 | avg: 0.480 ms | max: 27.091 ms | sum:65687.681 ms |
|
|
3 chromium-browse:(9) | 53637.904 ms | 132637 | avg: 0.481 ms | max: 24.756 ms | sum:63781.673 ms |
|
|
1 cyclictest:(5) | 12615.604 ms | 639689 | avg: 0.471 ms | max: 32.272 ms | sum:301351.094 ms |
|
|
2 cyclictest:(5) | 12511.583 ms | 642578 | avg: 0.448 ms | max: 44.243 ms | sum:287632.830 ms |
|
|
3 cyclictest:(5) | 12545.867 ms | 635953 | avg: 0.475 ms | max: 25.530 ms | sum:302374.658 ms |
|
|
|
|
100ms massive_intr 500us cyclictest PREEMPT_SHORT
|
|
|
|
1 massive_intr:(5) | 839843.919 ms | 837384 | avg: 0.264 ms | max: 74.366 ms | sum:221476.885 ms |
|
|
2 massive_intr:(5) | 852449.913 ms | 845086 | avg: 0.252 ms | max: 68.162 ms | sum:212595.968 ms |
|
|
3 massive_intr:(5) | 839180.725 ms | 836883 | avg: 0.266 ms | max: 69.742 ms | sum:222812.038 ms |
|
|
1 chromium-browse:(11) | 54591.481 ms | 138388 | avg: 0.458 ms | max: 35.427 ms | sum:63401.508 ms |
|
|
2 chromium-browse:(8) | 52034.541 ms | 132276 | avg: 0.436 ms | max: 31.826 ms | sum:57732.958 ms |
|
|
3 chromium-browse:(8) | 55231.771 ms | 141892 | avg: 0.469 ms | max: 27.607 ms | sum:66538.697 ms |
|
|
1 cyclictest:(5) | 13156.391 ms | 667412 | avg: 0.373 ms | max: 38.247 ms | sum:249174.502 ms |
|
|
2 cyclictest:(5) | 12688.939 ms | 665144 | avg: 0.374 ms | max: 33.548 ms | sum:248509.392 ms |
|
|
3 cyclictest:(5) | 13475.623 ms | 669110 | avg: 0.370 ms | max: 37.819 ms | sum:247673.390 ms |
|
|
|
|
As per the numbers, this makes the max-delay of cyclictest (short slice) more
consistent, and that consistency drops the sum-delay. The
|
|
trade-off is that the massive_intr (long slice) gets more context
|
|
switches and a slight increase in sum-delay.
|
|
|
|
[mike: numbers]
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
|
|
---
|
|
kernel/sched/fair.c | 11 ++++++++---
|
|
kernel/sched/features.h | 4 ++++
|
|
2 files changed, 12 insertions(+), 3 deletions(-)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 357005f0d..b16c70e3b 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -8022,9 +8022,14 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
|
|
cfs_rq = cfs_rq_of(se);
|
|
update_curr(cfs_rq);
|
|
|
|
- /*
|
|
- * XXX pick_eevdf(cfs_rq) != se ?
|
|
- */
|
|
+ if (sched_feat(PREEMPT_SHORT) && pse->slice < se->slice &&
|
|
+ entity_eligible(cfs_rq, pse) &&
|
|
+ (s64)(pse->deadline - se->deadline) < 0 &&
|
|
+ se->vlag == se->deadline) {
|
|
+ /* negate RUN_TO_PARITY */
|
|
+ se->vlag = se->deadline - 1;
|
|
+ }
|
|
+
|
|
if (pick_eevdf(cfs_rq) == pse)
|
|
goto preempt;
|
|
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index 46b65fdc6..642f1de58 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -14,6 +14,10 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
|
|
 * 0-lag point or until it has exhausted its slice.
|
|
*/
|
|
SCHED_FEAT(RUN_TO_PARITY, true)
|
|
+/*
|
|
+ * Allow tasks with a shorter slice to disregard RUN_TO_PARITY
|
|
+ */
|
|
+SCHED_FEAT(PREEMPT_SHORT, true)
|
|
|
|
/*
|
|
* Prefer to schedule the task we woke last (assuming it failed
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 3c04ff192a3569a6c0ee6924678bc598efc1af59 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Tue, 26 Sep 2023 14:39:41 +0200
|
|
Subject: [PATCH 26/28] sched/eevdf: Revenge of the Sith^WSleeper
|
|
|
|
For tasks that have received excess service (negative lag) allow them
|
|
to gain parity (zero lag) by sleeping.
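
To illustrate the earn-back arithmetic implemented below (the numbers are
invented for the example): a task parked with vlag = -1ms of virtual time,
which last ran 2ms ago, enqueued on a cfs_rq carrying twice the NICE_0 load:

  delta  = 2ms, halved to 1ms by GENTLE_SLEEPER
  vdelta = __calc_delta(1ms, NICE_0_LOAD, &cfs_rq->load) = 1ms * 1/2 = 0.5ms
  0.5ms < 1ms, so the lag becomes -1ms + 0.5ms = -0.5ms;
  a long enough sleep saturates at exactly 0-lag, never positive.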
|
|
|
|
slice 30000000 (*10)
|
|
# Min Latencies: 00041
|
|
# Avg Latencies: 00712
|
|
# Max Latencies: 287353
|
|
|
|
slice 3000000 (default)
|
|
# Min Latencies: 00054
|
|
# Avg Latencies: 00436
|
|
# Max Latencies: 23531
|
|
|
|
slice 300000 (/10)
|
|
# Min Latencies: 00054
|
|
# Avg Latencies: 00061
|
|
# Max Latencies: 05245
|
|
|
|
It sucks for many other things though... so let it be an experiment.
|
|
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
---
|
|
kernel/sched/fair.c | 36 ++++++++++++++++++++++++++++++++++++
|
|
kernel/sched/features.h | 6 ++++++
|
|
2 files changed, 42 insertions(+)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index b16c70e3b..10009c713 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -4956,6 +4956,33 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
+static inline u64
|
|
+entity_vlag_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
+{
|
|
+ u64 now, vdelta;
|
|
+ s64 delta;
|
|
+
|
|
+ if (!(flags & ENQUEUE_WAKEUP))
|
|
+ return se->vlag;
|
|
+
|
|
+ if (flags & ENQUEUE_MIGRATED)
|
|
+ return 0;
|
|
+
|
|
+ now = rq_clock_task(rq_of(cfs_rq));
|
|
+ delta = now - se->exec_start;
|
|
+ if (delta < 0)
|
|
+ return se->vlag;
|
|
+
|
|
+ if (sched_feat(GENTLE_SLEEPER))
|
|
+ delta /= 2;
|
|
+
|
|
+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
|
|
+ if (vdelta < -se->vlag)
|
|
+ return se->vlag + vdelta;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
static void
|
|
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
{
|
|
@@ -4980,6 +5007,15 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
|
lag = se->vlag;
|
|
|
|
+ /*
|
|
+ * Allow tasks that have received too much service (negative
|
|
+ * lag) to (re)gain parity (zero lag) by sleeping for the
|
|
+ * equivalent duration. This ensures they will be readily
|
|
+ * eligible.
|
|
+ */
|
|
+ if (sched_feat(PLACE_SLEEPER) && lag < 0)
|
|
+ lag = entity_vlag_sleeper(cfs_rq, se, flags);
|
|
+
|
|
/*
|
|
* If we want to place a task and preserve lag, we have to
|
|
* consider the effect of the new entity on the weighted
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index 642f1de58..97b5f6dd9 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -18,6 +18,12 @@ SCHED_FEAT(RUN_TO_PARITY, true)
|
|
* Allow tasks with a shorter slice to disregard RUN_TO_PARITY
|
|
*/
|
|
SCHED_FEAT(PREEMPT_SHORT, true)
|
|
+/*
|
|
+ * Let sleepers earn back lag, but not more than 0-lag. GENTLE_SLEEPERS earn at
|
|
+ * half the speed.
|
|
+ */
|
|
+SCHED_FEAT(PLACE_SLEEPER, false)
|
|
+SCHED_FEAT(GENTLE_SLEEPER, true)
|
|
|
|
/*
|
|
* Prefer to schedule the task we woke last (assuming it failed
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From 445a0b07989b3da3116f8f6bf7dd59806a79cc32 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Thu, 5 Oct 2023 15:30:13 +0200
|
|
Subject: [PATCH 27/28] sched/eevdf: Disable entity_eligible()
|
|
|
|
Disable entity_eligible() entirely; this makes tasks much easier to
|
|
pick, but also gives rise to degenerate cases like:
|
|
|
|
t=92 V=16
|
|
A |----<
|
|
B |<
|
|
>C |----------------<
|
|
D |<
|
|
E |<
|
|
F |<
|
|
G |<
|
|
|---------|-----*---|---------|---------|----
|
|
|
|
hence, default disable.
|
|
|
|
Suggested-by: Youssef Esmat <youssefesmat@chromium.org>
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
---
|
|
kernel/sched/fair.c | 3 +++
|
|
kernel/sched/features.h | 11 +++++++++++
|
|
2 files changed, 14 insertions(+)
|
|
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 10009c713..1bdd95677 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -724,6 +724,9 @@ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
*/
|
|
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
+ if (sched_feat(EVDF))
|
|
+ return true;
|
|
+
|
|
struct sched_entity *curr = cfs_rq->curr;
|
|
s64 avg = cfs_rq->avg_vruntime;
|
|
long load = cfs_rq->avg_load;
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index 97b5f6dd9..dacef8e1b 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -24,6 +24,17 @@ SCHED_FEAT(PREEMPT_SHORT, true)
|
|
*/
|
|
SCHED_FEAT(PLACE_SLEEPER, false)
|
|
SCHED_FEAT(GENTLE_SLEEPER, true)
|
|
+/*
|
|
+ * Disable the eligibility check -- always true.
|
|
+ *
|
|
+ * Selecting this allows short tasks, in the presence of a long task, to walk
|
|
+ * far past 0-lag and create a window where newly placed tasks will come in and
|
|
+ * starve the long task.
|
|
+ *
|
|
+ * Behaves quite terribly for mixed slice workloads as a result, very much not
|
|
+ * recommended.
|
|
+ */
|
|
+SCHED_FEAT(EVDF, false)
|
|
|
|
/*
|
|
* Prefer to schedule the task we woke last (assuming it failed
|
|
--
|
|
2.42.0
|
|
|
|
|
|
From f6be5a98f494e65149331150991c7079a7f9338c Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Fri, 15 Sep 2023 00:48:45 +0200
|
|
Subject: [PATCH 28/28] sched/eevdf: Delay dequeue
|
|
|
|
For tasks that have negative-lag (have received 'excess' service), delay the
|
|
dequeue and keep them in the runnable tree until they're eligible again. Or
|
|
rather, keep them until they're selected again, since finding their eligibility
|
|
crossover point is expensive.
|
|
|
|
The effect is a bit like a sleeper bonus: the tasks keep contending for service
until they either get a wakeup or are selected again and really dequeued.
|
|
|
|
This means that any actual dequeue happens with positive lag (service owed),
so such tasks are more readily run when woken next.
|
|
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
---
|
|
include/linux/sched.h | 1 +
|
|
kernel/sched/core.c | 88 +++++++++++++++++++++++++++++++++--------
|
|
kernel/sched/fair.c | 11 ++++++
|
|
kernel/sched/features.h | 11 ++++++
|
|
kernel/sched/sched.h | 3 +-
|
|
5 files changed, 97 insertions(+), 17 deletions(-)
|
|
|
|
diff --git a/include/linux/sched.h b/include/linux/sched.h
|
|
index e0a81ce05..93c03b162 100644
|
|
--- a/include/linux/sched.h
|
|
+++ b/include/linux/sched.h
|
|
@@ -894,6 +894,7 @@ struct task_struct {
|
|
unsigned sched_reset_on_fork:1;
|
|
unsigned sched_contributes_to_load:1;
|
|
unsigned sched_migrated:1;
|
|
+ unsigned sched_delayed:1;
|
|
|
|
/* Force alignment to the next boundary: */
|
|
unsigned :0;
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index 6fcccd15a..9f56638b6 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -3839,12 +3839,23 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
|
|
|
|
rq = __task_rq_lock(p, &rf);
|
|
if (task_on_rq_queued(p)) {
|
|
+ update_rq_clock(rq);
|
|
+ if (unlikely(p->sched_delayed)) {
|
|
+ p->sched_delayed = 0;
|
|
+ /* mustn't run a delayed task */
|
|
+ WARN_ON_ONCE(task_on_cpu(rq, p));
|
|
+ if (sched_feat(GENTLE_DELAY)) {
|
|
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
|
|
+ if (p->se.vlag > 0)
|
|
+ p->se.vlag = 0;
|
|
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
|
|
+ }
|
|
+ }
|
|
if (!task_on_cpu(rq, p)) {
|
|
/*
|
|
* When on_rq && !on_cpu the task is preempted, see if
|
|
* it should preempt the task that is current now.
|
|
*/
|
|
- update_rq_clock(rq);
|
|
wakeup_preempt(rq, p, wake_flags);
|
|
}
|
|
ttwu_do_wakeup(p);
|
|
@@ -6552,6 +6563,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|
# define SM_MASK_PREEMPT SM_PREEMPT
|
|
#endif
|
|
|
|
+static void deschedule_task(struct rq *rq, struct task_struct *p, unsigned long prev_state)
|
|
+{
|
|
+ p->sched_contributes_to_load =
|
|
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
|
|
+ !(prev_state & TASK_NOLOAD) &&
|
|
+ !(prev_state & TASK_FROZEN);
|
|
+
|
|
+ if (p->sched_contributes_to_load)
|
|
+ rq->nr_uninterruptible++;
|
|
+
|
|
+ deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
|
|
+
|
|
+ if (p->in_iowait) {
|
|
+ atomic_inc(&rq->nr_iowait);
|
|
+ delayacct_blkio_start();
|
|
+ }
|
|
+}
|
|
+
|
|
/*
|
|
* __schedule() is the main scheduler function.
|
|
*
|
|
@@ -6636,6 +6665,8 @@ static void __sched notrace __schedule(unsigned int sched_mode)
|
|
|
|
switch_count = &prev->nivcsw;
|
|
|
|
+ WARN_ON_ONCE(prev->sched_delayed);
|
|
+
|
|
/*
|
|
* We must load prev->state once (task_struct::state is volatile), such
|
|
* that we form a control dependency vs deactivate_task() below.
|
|
@@ -6645,14 +6676,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
|
|
if (signal_pending_state(prev_state, prev)) {
|
|
WRITE_ONCE(prev->__state, TASK_RUNNING);
|
|
} else {
|
|
- prev->sched_contributes_to_load =
|
|
- (prev_state & TASK_UNINTERRUPTIBLE) &&
|
|
- !(prev_state & TASK_NOLOAD) &&
|
|
- !(prev_state & TASK_FROZEN);
|
|
-
|
|
- if (prev->sched_contributes_to_load)
|
|
- rq->nr_uninterruptible++;
|
|
-
|
|
/*
|
|
* __schedule() ttwu()
|
|
* prev_state = prev->state; if (p->on_rq && ...)
|
|
@@ -6664,17 +6687,50 @@ static void __sched notrace __schedule(unsigned int sched_mode)
|
|
*
|
|
* After this, schedule() must not care about p->state any more.
|
|
*/
|
|
- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
|
|
-
|
|
- if (prev->in_iowait) {
|
|
- atomic_inc(&rq->nr_iowait);
|
|
- delayacct_blkio_start();
|
|
- }
|
|
+ if (sched_feat(DELAY_DEQUEUE) &&
|
|
+ prev->sched_class->delay_dequeue_task &&
|
|
+ prev->sched_class->delay_dequeue_task(rq, prev))
|
|
+ prev->sched_delayed = 1;
|
|
+ else
|
|
+ deschedule_task(rq, prev, prev_state);
|
|
}
|
|
switch_count = &prev->nvcsw;
|
|
}
|
|
|
|
- next = pick_next_task(rq, prev, &rf);
|
|
+ for (struct task_struct *tmp = prev;;) {
|
|
+ unsigned long tmp_state;
|
|
+
|
|
+ next = pick_next_task(rq, tmp, &rf);
|
|
+ if (unlikely(tmp != prev))
|
|
+ finish_task(tmp);
|
|
+
|
|
+ if (likely(!next->sched_delayed))
|
|
+ break;
|
|
+
|
|
+ next->sched_delayed = 0;
|
|
+
|
|
+ /*
|
|
+ * A sched_delayed task must not be runnable at this point, see
|
|
+ * ttwu_runnable().
|
|
+ */
|
|
+ tmp_state = READ_ONCE(next->__state);
|
|
+ if (WARN_ON_ONCE(!tmp_state))
|
|
+ break;
|
|
+
|
|
+ prepare_task(next);
|
|
+ /*
|
|
+ * Order ->on_cpu and ->on_rq, see the comments in
|
|
+ * try_to_wake_up(). Normally this is smp_mb__after_spinlock()
|
|
+ * above.
|
|
+ */
|
|
+ smp_wmb();
|
|
+ deschedule_task(rq, next, tmp_state);
|
|
+ if (sched_feat(GENTLE_DELAY) && next->se.vlag > 0)
|
|
+ next->se.vlag = 0;
|
|
+
|
|
+ tmp = next;
|
|
+ }
|
|
+
|
|
clear_tsk_need_resched(prev);
|
|
clear_preempt_need_resched();
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
index 1bdd95677..8c1d8bbe7 100644
|
|
--- a/kernel/sched/fair.c
|
|
+++ b/kernel/sched/fair.c
|
|
@@ -8260,6 +8260,16 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
|
|
return pick_next_task_fair(rq, NULL, NULL);
|
|
}
|
|
|
|
+static bool delay_dequeue_task_fair(struct rq *rq, struct task_struct *p)
|
|
+{
|
|
+ struct sched_entity *se = &p->se;
|
|
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
+
|
|
+ update_curr(cfs_rq);
|
|
+
|
|
+ return !entity_eligible(cfs_rq, se);
|
|
+}
|
|
+
|
|
/*
|
|
* Account for a descheduled task:
|
|
*/
|
|
@@ -12714,6 +12724,7 @@ DEFINE_SCHED_CLASS(fair) = {
|
|
|
|
.wakeup_preempt = check_preempt_wakeup_fair,
|
|
|
|
+ .delay_dequeue_task = delay_dequeue_task_fair,
|
|
.pick_next_task = __pick_next_task_fair,
|
|
.put_prev_task = put_prev_task_fair,
|
|
.set_next_task = set_next_task_fair,
|
|
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
|
index dacef8e1b..fd2c963b7 100644
|
|
--- a/kernel/sched/features.h
|
|
+++ b/kernel/sched/features.h
|
|
@@ -35,6 +35,17 @@ SCHED_FEAT(GENTLE_SLEEPER, true)
|
|
* recommended.
|
|
*/
|
|
SCHED_FEAT(EVDF, false)
|
|
+/*
|
|
+ * Delay dequeueing tasks until they get selected or woken.
|
|
+ *
|
|
+ * By delaying the dequeue for non-eligible tasks, they remain in the
|
|
+ * competition and can burn off their negative lag. When they get selected
|
|
+ * they'll have positive lag by definition.
|
|
+ *
|
|
+ * GENTLE_DELAY clips the lag on dequeue (or wakeup) to 0.
|
|
+ */
|
|
+SCHED_FEAT(DELAY_DEQUEUE, true)
|
|
+SCHED_FEAT(GENTLE_DELAY, true)
|
|
|
|
/*
|
|
* Prefer to schedule the task we woke last (assuming it failed
|
|
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index 0a1957994..50bca9b72 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -2213,6 +2213,7 @@ struct sched_class {
|
|
|
|
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
|
|
|
|
+ bool (*delay_dequeue_task)(struct rq *rq, struct task_struct *p);
|
|
struct task_struct *(*pick_next_task)(struct rq *rq);
|
|
|
|
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
|
|
@@ -2266,7 +2267,7 @@ struct sched_class {
|
|
|
|
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
|
|
{
|
|
- WARN_ON_ONCE(rq->curr != prev);
|
|
+// WARN_ON_ONCE(rq->curr != prev);
|
|
prev->sched_class->put_prev_task(rq, prev);
|
|
}
|
|
|
|
--
|
|
2.42.0
|
|
|