Fixup (bore) EEVDF issue in #819 (#820)

Signed-off-by: Peter Jung <admin@ptr1337.dev>
Author: ptr1337
Date: 2023-09-23 03:41:08 +02:00
Committed by: GitHub
Parent: 6aab0087df
Commit: a4c0ab6b9f
2 changed files with 1 addition and 516 deletions


@@ -859,11 +859,7 @@ _tkg_srcprep() {
if [ "${_cpusched}" = "bore-eevdf" ]; then
_msg="Applying BORE-EEVDF patch"
if [ "$_kver" != "605" ]; then
curl "https://raw.githubusercontent.com/CachyOS/kernel-patches/master/${_basekernel}/sched/0001-bore-eevdf.patch" > "$srcdir"/0001-bore-eevdf.patch
else
curl "https://raw.githubusercontent.com/sirlucjan/kernel-patches/master/${_basekernel}/bore-eevdf-patches-v2-sep/0016-linux6.5-bore3.1.3.patch" > "$srcdir"/0001-bore-eevdf.patch
fi
curl "https://raw.githubusercontent.com/CachyOS/kernel-patches/master/${_basekernel}/sched/0001-bore-eevdf.patch" > "$srcdir"/0001-bore-eevdf.patch
tkgpatch="$srcdir/0001-bore-eevdf.patch" && _tkg_patcher
fi
fi


@@ -2896,514 +2896,3 @@ index 03db55504..f68919800 100644
--
2.42.0
From edbc7fe6658db891c80f244dc397f4e0247f6f3d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Sep 2023 00:48:55 +0200
Subject: [PATCH 13/15] sched/eevdf: Also update slice on placement
Tasks that never consume their full slice would not update their slice value.
This means that tasks that are spawned before the sysctl scaling keep their
original (UP) slice length.
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/fair.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1cdc95725..efbcdc69c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4918,10 +4918,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 vslice = calc_delta_fair(se->slice, se);
- u64 vruntime = avg_vruntime(cfs_rq);
+ u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
+ se->slice = sysctl_sched_base_slice;
+ vslice = calc_delta_fair(se->slice, se);
+
/*
* Due to how V is constructed as the weighted average of entities,
* adding tasks with positive lag, or removing tasks with negative lag
--
2.42.0
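
For context on the fix above: place_entity() converts the wall-clock slice into
virtual time via calc_delta_fair(), scaling inversely with the entity's weight,
so refreshing se->slice on placement makes that conversion track the current
sysctl value. A minimal standalone sketch of the conversion (illustrative
constants, not the kernel's weight tables):

#include <stdio.h>
#include <stdint.h>

#define NICE_0_WEIGHT 1024ULL	/* weight of a nice-0 task; illustrative */

/* Sketch of the calc_delta_fair() idea: virtual time advances more
 * slowly for heavier entities, so the same wall-clock slice yields a
 * smaller virtual slice as weight grows. */
static uint64_t to_vslice(uint64_t slice_ns, uint64_t weight)
{
	return slice_ns * NICE_0_WEIGHT / weight;
}

int main(void)
{
	uint64_t slice = 3000000;	/* 3 ms base slice, in ns */

	/* Same wall-clock request, different virtual slices. */
	printf("w=1024: vslice=%llu ns\n",
	       (unsigned long long)to_vslice(slice, 1024));
	printf("w=2048: vslice=%llu ns\n",
	       (unsigned long long)to_vslice(slice, 2048));
	return 0;
}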
From 0f1fadfb03ba9ba181e4631de8cd97ba765fae1d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Sep 2023 00:48:45 +0200
Subject: [PATCH 14/15] sched/eevdf: Delay dequeue
For tasks that have negative lag (have received 'excess' service), delay the
dequeue and keep them in the runnable tree until they're eligible again. Or
rather, keep them until they're selected again, since finding their eligibility
crossover point is expensive.
The effect is a bit like sleeper bonus, the tasks keep contending for service
until either they get a wakeup or until they're selected again and are really
dequeued.
This means that any actual dequeue happens with positive lag (service owed),
so the task is more readily run on wakeup.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 41 +++++++++++++++++++++++++++++++++++------
kernel/sched/fair.c | 9 +++++++++
kernel/sched/features.h | 1 +
kernel/sched/sched.h | 3 ++-
5 files changed, 48 insertions(+), 7 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 35331c35f..d40d98313 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -891,6 +891,7 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
+ unsigned sched_delayed:1;
/* Force alignment to the next boundary: */
unsigned :0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8116ef56d..cfb0ffa69 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6551,6 +6551,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
# define SM_MASK_PREEMPT SM_PREEMPT
#endif
+static void __deschedule_task(struct rq *rq, struct task_struct *p)
+{
+ deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+
+ if (p->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+}
+
/*
* __schedule() is the main scheduler function.
*
@@ -6663,17 +6673,36 @@ static void __sched notrace __schedule(unsigned int sched_mode)
*
* After this, schedule() must not care about p->state any more.
*/
- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+ if (!(sched_feat(DELAY_DEQUEUE) &&
+ prev->sched_class->eligible_task &&
+ !prev->sched_class->eligible_task(rq, prev)))
+ __deschedule_task(rq, prev);
+ else
+ prev->sched_delayed = 1;
+ }
+ switch_count = &prev->nvcsw;
+ }
+
+ for (struct task_struct *tmp = prev;;) {
- if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
- delayacct_blkio_start();
+ next = pick_next_task(rq, tmp, &rf);
+ if (unlikely(tmp != prev))
+ finish_task(tmp);
+
+ if (sched_feat(DELAY_DEQUEUE) && unlikely(next->sched_delayed)) {
+ next->sched_delayed = 0;
+ if (READ_ONCE(next->__state)) {
+ prepare_task(next);
+ smp_wmb();
+ __deschedule_task(rq, next);
+ tmp = next;
+ continue;
}
}
- switch_count = &prev->nvcsw;
+
+ break;
}
- next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index efbcdc69c..729507e40 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8174,6 +8174,14 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
return pick_next_task_fair(rq, NULL, NULL);
}
+static bool eligible_task_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ return entity_eligible(cfs_rq, se);
+}
+
/*
* Account for a descheduled task:
*/
@@ -12628,6 +12636,7 @@ DEFINE_SCHED_CLASS(fair) = {
.check_preempt_curr = check_preempt_wakeup,
+ .eligible_task = eligible_task_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 546d212ef..5ae5a6f92 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -7,6 +7,7 @@
SCHED_FEAT(PLACE_LAG, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
SCHED_FEAT(RUN_TO_PARITY, true)
+SCHED_FEAT(DELAY_DEQUEUE, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 576d371c8..c18ab7c2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2219,6 +2219,7 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+ bool (*eligible_task)(struct rq *rq, struct task_struct *p);
struct task_struct *(*pick_next_task)(struct rq *rq);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
@@ -2272,7 +2273,7 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- WARN_ON_ONCE(rq->curr != prev);
+// WARN_ON_ONCE(rq->curr != prev);
prev->sched_class->put_prev_task(rq, prev);
}
--
2.42.0
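
To make the eligible_task() hook above concrete: in EEVDF an entity is eligible
when its lag is non-negative, i.e. its vruntime sits at or behind the queue's
weighted average V; DELAY_DEQUEUE keeps ineligible tasks queued until they are
picked again. A toy model of the test (computed directly, not the kernel's
incremental avg_vruntime() bookkeeping):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct entity {
	int64_t  vruntime;
	uint64_t weight;
};

/* Weighted average vruntime V over the queue. */
static int64_t avg_vruntime_toy(const struct entity *q, int n)
{
	int64_t sum = 0;
	uint64_t w = 0;

	for (int i = 0; i < n; i++) {
		sum += q[i].vruntime * (int64_t)q[i].weight;
		w += q[i].weight;
	}
	return sum / (int64_t)w;
}

/* lag_i = w_i * (V - v_i); eligible means lag >= 0, i.e. v_i <= V. */
static bool eligible_toy(const struct entity *se, int64_t V)
{
	return se->vruntime <= V;
}

int main(void)
{
	struct entity q[] = { {100, 1024}, {140, 1024}, {90, 2048} };
	int64_t V = avg_vruntime_toy(q, 3);	/* V = 105 here */

	for (int i = 0; i < 3; i++)
		printf("task %d: %s\n", i,
		       eligible_toy(&q[i], V) ? "eligible" : "stays queued");
	return 0;
}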
From 4aba3e1c3bbe4a36d4b9e405be8a66d7c10d6495 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 May 2023 13:46:30 +0200
Subject: [PATCH 15/15] sched/eevdf: Use sched_attr::sched_runtime to set
request/slice suggestion
Allow applications to directly set a suggested request/slice length using
sched_attr::sched_runtime.
The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms], i.e.
one tenth of a tick at HZ=1000 up to ten ticks at HZ=100.
Applications should use a high-confidence (95%+) estimate of their periodic
runtime as the target slice. Using a smaller slice will introduce undue
preemptions, while using a larger value will increase latency.
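
As a hedged illustration of what this enables from userspace (a sketch:
sched_setattr() is a real syscall without a glibc wrapper, so the UAPI struct
is declared locally; the sched_runtime-as-slice semantics assume this patch
is applied):

#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* UAPI layout of struct sched_attr (include/uapi/linux/sched/types.h). */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* with this patch: suggested slice, ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_OTHER;
	attr.sched_runtime = 2 * 1000 * 1000;	/* ask for a 2 ms slice;
						   clamped to [0.1ms, 100ms] */

	if (syscall(SYS_sched_setattr, 0 /* self */, &attr, 0))
		perror("sched_setattr");
	return 0;
}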
For all the following examples assume a scheduling quantum of 8, and for
consistency all examples have W=4:
{A,B,C,D}(w=1,r=8):
ABCD...
+---+---+---+---
t=0, V=1.5 t=1, V=3.5
A |------< A |------<
B |------< B |------<
C |------< C |------<
D |------< D |------<
---+*------+-------+--- ---+--*----+-------+---
t=2, V=5.5 t=3, V=7.5
A |------< A |------<
B |------< B |------<
C |------< C |------<
D |------< D |------<
---+----*--+-------+--- ---+------*+-------+---
Note: 4 identical tasks in FIFO order
~~~
{A,B}(w=1,r=16) C(w=2,r=16)
AACCBBCC...
+---+---+---+---
t=0, V=1.25 t=2, V=5.25
A |--------------< A |--------------<
B |--------------< B |--------------<
C |------< C |------<
---+*------+-------+--- ---+----*--+-------+---
t=4, V=8.25 t=6, V=12.25
A |--------------< A |--------------<
B |--------------< B |--------------<
C |------< C |------<
---+-------*-------+--- ---+-------+---*---+---
Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2
task doesn't go below q.
Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length.
Note: the period of the heavy task is half the full period at:
W*(r_i/w_i) = 4*(2q/2) = 4q
~~~
{A,C,D}(w=1,r=16) B(w=1,r=8):
BAACCBDD...
+---+---+---+---
t=0, V=1.5 t=1, V=3.5
A |--------------< A |---------------<
B |------< B |------<
C |--------------< C |--------------<
D |--------------< D |--------------<
---+*------+-------+--- ---+--*----+-------+---
t=3, V=7.5 t=5, V=11.5
A |---------------< A |---------------<
B |------< B |------<
C |--------------< C |--------------<
D |--------------< D |--------------<
---+------*+-------+--- ---+-------+--*----+---
t=6, V=13.5
A |---------------<
B |------<
C |--------------<
D |--------------<
---+-------+----*--+---
Note: 1 short task -- again double r so that the deadline of the short task
won't be below q. Made B short because it's not the leftmost task, but is
eligible with the 0,1,2,3 spread.
Note: like with the heavy task, the period of the short task observes:
W*(r_i/w_i) = 4*(1q/1) = 4q
~~~
A(w=1,r=16) B(w=1,r=8) C(w=2,r=16)
BCCAABCC...
+---+---+---+---
t=0, V=1.25 t=1, V=3.25
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+*------+-------+--- ---+--*----+-------+---
t=3, V=7.25 t=5, V=11.25
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+------*+-------+--- ---+-------+--*----+---
t=6, V=13.25
A |--------------<
B |------<
C |------<
---+-------+----*--+---
Note: 1 heavy and 1 short task -- combine them all.
Note: both the short and heavy task end up with a period of 4q
~~~
A(w=1,r=16) B(w=2,r=16) C(w=1,r=8)
BBCAABBC...
+---+---+---+---
t=0, V=1 t=2, V=5
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+*------+-------+--- ---+----*--+-------+---
t=3, V=7 t=5, V=11
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+------*+-------+--- ---+-------+--*----+---
t=7, V=15
A |--------------<
B |------<
C |------<
---+-------+------*+---
Note: as before but permuted
~~~
From all this it can be deduced that, for the steady state:
- the total period (P) of a schedule is: W*max(r_i/w_i)
- the average period of a task is: W*(r_i/w_i)
- each task obtains the fair share: w_i/W of each full period P
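
As a toy sanity check of these formulas against the heavy-task example above
({A,B}(w=1,r=16) C(w=2,r=16), q=8, W=4):

#include <stdio.h>

int main(void)
{
	int W = 4;		/* total weight */
	int r[] = {16, 16, 16};	/* requests of A, B, C */
	int w[] = {1, 1, 2};	/* weights  of A, B, C */
	int n = 3, P = 0;

	/* Full period: P = W * max(r_i / w_i) */
	for (int i = 0; i < n; i++)
		if (W * r[i] / w[i] > P)
			P = W * r[i] / w[i];
	printf("P = %d (8q with q=8)\n", P);

	/* Per-task period W*(r_i/w_i); C lands at 4q as noted above. */
	for (int i = 0; i < n; i++)
		printf("task %c: period %d, share %d/%d of P\n",
		       'A' + i, W * r[i] / w[i], w[i], W);
	return 0;
}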
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 3 +++
kernel/sched/core.c | 33 ++++++++++++++++++++++++++-------
kernel/sched/fair.c | 6 ++++--
3 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d40d98313..93c03b162 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -555,6 +555,9 @@ struct sched_entity {
struct list_head group_node;
unsigned int on_rq;
+ unsigned int custom_slice : 1;
+ /* 31 bits hole */
+
u64 exec_start;
u64 sum_exec_runtime;
u64 prev_sum_exec_runtime;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cfb0ffa69..1ae5a8272 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4502,7 +4502,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
- p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -4756,6 +4755,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
/*
* We don't need the reset flag anymore after the fork. It has
@@ -7556,10 +7557,20 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.custom_slice = 1;
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
@@ -7744,7 +7755,9 @@ static int __sched_setscheduler(struct task_struct *p,
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
@@ -7890,6 +7903,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
+ if (p->se.custom_slice)
+ attr.sched_runtime = p->se.slice;
+
/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
@@ -8066,12 +8082,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}
/**
@@ -10090,6 +10108,7 @@ void __init sched_init(void)
}
set_load_weight(&init_task, false);
+ init_task.se.slice = sysctl_sched_base_slice,
/*
* The boot idle thread does lazy MMU switching as well:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 729507e40..51e19a1fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -973,7 +973,8 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
* nice) while the request time r_i is determined by
* sysctl_sched_base_slice.
*/
- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
/*
* EEVDF: vd_i = ve_i + r_i / w_i
@@ -4921,7 +4922,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);
/*
--
2.42.0