Update EEVDF patches (#802)

kylon
2023-08-31 17:02:05 +02:00
committed by GitHub
parent 7b4bf31ffb
commit 2114c55a35
2 changed files with 2162 additions and 398 deletions

View File

@@ -2756,3 +2756,885 @@ index 7ff9965570e69..db5853761b1f3 100644
--
cgit
From 246c6d7ab4d042b185d7df71f437137d43cbb83a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 25 Mar 2023 00:14:04 +0100
Subject: sched/eevdf: Better handle mixed slice length
In the case where (due to latency-nice) there are different request
sizes in the tree, the smaller requests tend to be dominated by the
larger. Also note how the EEVDF lag limits are based on r_max.
Therefore, add a heuristic that, for the mixed request size case, moves
smaller requests to placement strategy #2, which ensures they are
immediately eligible and, due to their smaller (virtual) deadline,
will cause preemption.
NOTE: this relies on update_entity_lag() to impose lag limits above
a single slice.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++++
kernel/sched/features.h | 1 +
kernel/sched/sched.h | 1 +
3 files changed, 41 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c8c9f7d8496a..16949f7bbb172 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_slice += se->slice * weight;
cfs_rq->avg_load += weight;
}
@@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_slice -= se->slice * weight;
cfs_rq->avg_load -= weight;
}
@@ -4908,6 +4910,30 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
+static inline bool
+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags)
+{
+ u64 now, vdelta;
+ s64 delta;
+
+ if (!(flags & ENQUEUE_WAKEUP))
+ return false;
+
+ if (flags & ENQUEUE_MIGRATED)
+ return true;
+
+ now = rq_clock_task(rq_of(cfs_rq));
+ delta = now - se->exec_start;
+ if (delta < 0)
+ return false;
+
+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
+ if (vdelta < vslice)
+ return false;
+
+ return true;
+}
+
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -4929,6 +4955,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
lag = se->vlag;
+ /*
+ * For latency sensitive tasks; those that have a shorter than
+ * average slice and do not fully consume the slice, transition
+ * to EEVDF placement strategy #2.
+ */
+ if (sched_feat(PLACE_FUDGE) &&
+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
+ entity_has_slept(cfs_rq, se, vslice, flags)) {
+ lag += vslice;
+ if (lag > 0)
+ lag = 0;
+ }
+
/*
* If we want to place a task and preserve lag, we have to
* consider the effect of the new entity on the weighted
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 54334ca5c5c61..7d65b40299d91 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -5,6 +5,7 @@
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_FUDGE, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db5853761b1f3..bc45beee335c5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -549,6 +549,7 @@ struct cfs_rq {
unsigned int idle_h_nr_running; /* SCHED_IDLE */
s64 avg_vruntime;
+ u64 avg_slice;
u64 avg_load;
u64 exec_clock;
--
cgit
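
For reference, here is a small standalone C sketch (not part of the patch above;
every name in it is invented for illustration) of the PLACE_FUDGE test:
avg_vruntime_add()/avg_vruntime_sub() maintain avg_slice and avg_load as
load-weighted sums, so "cfs_rq->avg_slice > se->slice * cfs_rq->avg_load" is the
division-free form of asking whether this entity's request is shorter than the
load-weighted average slice of the queue.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* toy stand-in for a queued entity: request length (ns) and load weight */
struct toy_entity { uint64_t slice; uint64_t weight; };

/* same comparison PLACE_FUDGE uses, recomputed from scratch for clarity */
static bool shorter_than_weighted_avg(const struct toy_entity *q, int n, int i)
{
    uint64_t avg_slice = 0, avg_load = 0;

    for (int k = 0; k < n; k++) {   /* kept incrementally by add/sub in the patch */
        avg_slice += q[k].slice * q[k].weight;
        avg_load  += q[k].weight;
    }
    return avg_slice > q[i].slice * avg_load;
}

int main(void)
{
    /* hypothetical queue: two 3 ms requests and one 0.5 ms request, equal weight */
    struct toy_entity q[] = {
        { 3000000, 1024 }, { 3000000, 1024 }, { 500000, 1024 },
    };

    for (int i = 0; i < 3; i++)
        printf("entity %d shorter than weighted average: %d\n",
               i, shorter_than_weighted_avg(q, 3, i));
    return 0;
}
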
From 36b9081885fee5764b53970dd2d6afe8c2f13b7f Mon Sep 17 00:00:00 2001
From: Parth Shah <parth@linux.ibm.com>
Date: Sat, 11 Mar 2023 12:20:21 +0100
Subject: sched: Introduce latency-nice as a per-task attribute
Latency-nice indicates the latency requirements of a task with respect
to the other tasks in the system. The value of the attribute can be within
the range [-20, 19], both inclusive, to be in line with task nice values.
Just like task nice, -20 is the 'highest' priority and conveys that this
task should get minimal latency; conversely, 19 is the lowest priority
and conveys that this task will get the least consideration and will thus
receive maximal latency.
[peterz: rebase, squash]
Signed-off-by: Parth Shah <parth@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 1 +
include/uapi/linux/sched.h | 4 +++-
include/uapi/linux/sched/types.h | 19 +++++++++++++++++++
init/init_task.c | 3 ++-
kernel/sched/core.c | 27 ++++++++++++++++++++++++++-
kernel/sched/debug.c | 1 +
tools/include/uapi/linux/sched.h | 4 +++-
7 files changed, 55 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 177b3f3676ef8..80bb40a63e9aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -790,6 +790,7 @@ struct task_struct {
int static_prio;
int normal_prio;
unsigned int rt_priority;
+ int latency_prio;
struct sched_entity se;
struct sched_rt_entity rt;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab26..b2e932c25be62 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbfe..db1e8199e8c80 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -10,6 +10,7 @@ struct sched_param {
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */
/*
* Extended scheduling parameters data structure.
@@ -98,6 +99,22 @@ struct sched_param {
* scheduled on a CPU with no more capacity than the specified value.
*
* A task utilization boundary can be reset by setting the attribute to -1.
+ *
+ * Latency Tolerance Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows to specify the relative latency
+ * requirements of a task with respect to the other tasks running/queued in the
+ * system.
+ *
+ * @ sched_latency_nice task's latency_nice value
+ *
+ * The latency_nice of a task can have any value in a range of
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
+ *
+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
+ * taken for a task requiring a lower latency as opposed to the task with
+ * higher latency_nice.
*/
struct sched_attr {
__u32 size;
@@ -120,6 +137,8 @@ struct sched_attr {
__u32 sched_util_min;
__u32 sched_util_max;
+ /* latency requirement hints */
+ __s32 sched_latency_nice;
};
#endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b1..511cbcf3510dc 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,6 +78,7 @@ struct task_struct init_task
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
+ .latency_prio = DEFAULT_PRIO,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
@@ -89,7 +90,7 @@ struct task_struct init_task
.fn = do_no_restart_syscall,
},
.se = {
- .group_node = LIST_HEAD_INIT(init_task.se.group_node),
+ .group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5d3422f7d0de..b3533d0d4a2ca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4757,6 +4757,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->latency_prio = NICE_TO_PRIO(0);
+
/*
* We don't need the reset flag anymore after the fork. It has
* fulfilled its duty:
@@ -7531,7 +7533,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
#define SETPARAM_POLICY -1
static void __setscheduler_params(struct task_struct *p,
- const struct sched_attr *attr)
+ const struct sched_attr *attr)
{
int policy = attr->sched_policy;
@@ -7555,6 +7557,13 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
+static void __setscheduler_latency(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
+ p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+}
+
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -7689,6 +7698,13 @@ recheck:
return retval;
}
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+ if (attr->sched_latency_nice > MAX_NICE)
+ return -EINVAL;
+ if (attr->sched_latency_nice < MIN_NICE)
+ return -EINVAL;
+ }
+
/* Update task specific "requested" clamps */
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
retval = uclamp_validate(p, attr);
@@ -7736,6 +7752,9 @@ recheck:
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
+ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
retval = 0;
@@ -7824,6 +7843,7 @@ change:
__setscheduler_params(p, attr);
__setscheduler_prio(p, newprio);
}
+ __setscheduler_latency(p, attr);
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -8035,6 +8055,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
+ size < SCHED_ATTR_SIZE_VER2)
+ return -EINVAL;
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -8272,6 +8295,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
get_params(p, &kattr);
kattr.sched_flags &= SCHED_FLAG_ALL;
+ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio);
+
#ifdef CONFIG_UCLAMP_TASK
/*
* This could race with another potential updater, but this is fine
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db63..5c743bcb340d2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1086,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#endif
P(policy);
P(prio);
+ P(latency_prio);
if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
index 3bac0a8ceab26..b2e932c25be62 100644
--- a/tools/include/uapi/linux/sched.h
+++ b/tools/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
--
cgit
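
For completeness, a minimal userspace sketch of exercising the new attribute on a
kernel that carries the patch above. It is illustrative only: the sched_attr layout
is declared locally (as sched_attr_v2) because distribution headers may not yet
carry the sched_latency_nice field, and the call also (re)sets SCHED_NORMAL with
nice 0.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#define SCHED_FLAG_LATENCY_NICE 0x80    /* from the patched uapi header */

struct sched_attr_v2 {                  /* local copy of the VER2 layout */
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime;
    uint64_t sched_deadline;
    uint64_t sched_period;
    uint32_t sched_util_min;
    uint32_t sched_util_max;
    int32_t  sched_latency_nice;        /* new in SCHED_ATTR_SIZE_VER2 */
};

int main(void)
{
    struct sched_attr_v2 attr;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);           /* covers the VER2 fields */
    attr.sched_policy = 0;              /* SCHED_NORMAL; nice defaults to 0 */
    attr.sched_flags = SCHED_FLAG_LATENCY_NICE;
    attr.sched_latency_nice = -10;      /* valid range [-20, 19], lower = less latency */

    if (syscall(SYS_sched_setattr, 0, &attr, 0))    /* pid 0 == calling task */
        perror("sched_setattr");
    return 0;
}
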
From 9f9a3323112d3aa5afa466b1e391e137f28dc79d Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Fri, 24 Feb 2023 10:34:51 +0100
Subject: sched/fair: Implement latency-nice
Implement latency-nice as a modulation of the EEVDF r_i parameter,
specifically apply the inverse sched_prio_to_weight[] relation on
base_slice.
Given a base slice of 3 [ms], this gives a range of:
latency-nice 19: 3*1024 / 15 ~= 204.8 [ms]
latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]
(which might not make sense)
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
kernel/sched/core.c | 14 ++++++++++----
kernel/sched/fair.c | 22 +++++++++++++++-------
kernel/sched/sched.h | 2 ++
3 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b3533d0d4a2ca..263caac8f76b7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}
+static inline void set_latency_prio(struct task_struct *p, int prio)
+{
+ p->latency_prio = prio;
+ set_latency_fair(&p->se, prio - MAX_RT_PRIO);
+}
+
#ifdef CONFIG_UCLAMP_TASK
/*
* Serializes updates of utilization clamp values
@@ -4502,9 +4508,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
- p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
+ set_latency_prio(p, p->latency_prio);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
@@ -4756,8 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
-
- p->latency_prio = NICE_TO_PRIO(0);
+ set_latency_prio(p, NICE_TO_PRIO(0));
/*
* We don't need the reset flag anymore after the fork. It has
@@ -7561,7 +7567,7 @@ static void __setscheduler_latency(struct task_struct *p,
const struct sched_attr *attr)
{
if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
- p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 16949f7bbb172..c2019e7d46cf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -952,6 +952,21 @@ int sched_update_scaling(void)
}
#endif
+void set_latency_fair(struct sched_entity *se, int prio)
+{
+ u32 weight = sched_prio_to_weight[prio];
+ u64 base = sysctl_sched_base_slice;
+
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * latency-nice.
+ *
+ * Smaller request gets better latency.
+ */
+ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
+}
+
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
@@ -963,13 +978,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
if ((s64)(se->vruntime - se->deadline) < 0)
return;
- /*
- * For EEVDF the virtual time slope is determined by w_i (iow.
- * nice) while the request time r_i is determined by
- * sysctl_sched_base_slice.
- */
- se->slice = sysctl_sched_base_slice;
-
/*
* EEVDF: vd_i = ve_i + r_i / w_i
*/
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bc45beee335c5..8f8d903a01892 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2520,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
+extern void set_latency_fair(struct sched_entity *se, int prio);
+
#ifdef CONFIG_SCHED_HRTICK
/*
--
cgit
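
As a quick sanity check of the mapping introduced above
(se->slice = (base << SCHED_FIXEDPOINT_SHIFT) / weight), here is a standalone C
sketch that reproduces the 3 ms base-slice endpoints quoted in the changelog, using
the standard sched_prio_to_weight[] values for nice -20, 0 and 19 (any difference
against the quoted numbers is just rounding):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t base_slice_ns = 3000000;     /* 3 ms, as in the changelog example */
    const struct { int nice; uint32_t weight; } lat[] = {
        { -20, 88761 }, { 0, 1024 }, { 19, 15 },
    };

    for (unsigned int i = 0; i < sizeof(lat) / sizeof(lat[0]); i++) {
        /* SCHED_FIXEDPOINT_SHIFT == 10, i.e. multiply by 1024 */
        uint64_t slice_ns = (base_slice_ns << 10) / lat[i].weight;

        printf("latency-nice %3d -> r_i = %9.4f ms\n",
               lat[i].nice, slice_ns / 1e6);
    }
    return 0;
}
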
From a317f35154852bc023a7ab2e3fa491e1897af72f Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 24 Feb 2023 10:34:52 +0100
Subject: sched/fair: Add sched group latency support
A task can set its latency priority with sched_setattr(), which is then used
to set the latency offset of its sched_entity, but sched group entities
still have the default latency offset value.
Add a latency.nice field in cpu cgroup controller to set the latency
priority of the group similarly to sched_setattr(). The latency priority
is then used to set the offset of the sched_entities of the group.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20230224093454.956298-7-vincent.guittot@linaro.org
---
Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++
kernel/sched/core.c | 30 ++++++++++++++++++++++++++++++
kernel/sched/fair.c | 27 +++++++++++++++++++++++++++
kernel/sched/sched.h | 4 ++++
4 files changed, 71 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 4ef8901911961..3a8d3e1e55910 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
values similar to the sched_setattr(2). This maximum utilization
value is used to clamp the task specific maximum utilization clamp.
+ cpu.latency.nice
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ The nice value is in the range [-20, 19].
+
+ This interface file allows reading and setting latency using the
+ same values used by sched_setattr(2). The latency_nice of a group is
+ used to limit the impact of the latency_nice of a task outside the
+ group.
Memory
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 263caac8f76b7..8a541fe2d4626 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11247,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return PRIO_TO_NICE(css_tg(css)->latency_prio);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ int prio;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ prio = NICE_TO_PRIO(nice);
+
+ return sched_group_set_latency(css_tg(css), prio);
+}
#endif
static struct cftype cpu_legacy_files[] = {
@@ -11261,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -11500,6 +11524,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2019e7d46cf5..8a4799c600309 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12545,6 +12545,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;
tg->shares = NICE_0_LOAD;
+ tg->latency_prio = DEFAULT_PRIO;
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
@@ -12643,6 +12644,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}
se->my_q = cfs_rq;
+
+ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO);
+
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -12773,6 +12777,29 @@ next_cpu:
return 0;
}
+int sched_group_set_latency(struct task_group *tg, int prio)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->latency_prio == prio) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->latency_prio = prio;
+
+ for_each_possible_cpu(i)
+ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO);
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8f8d903a01892..4236c4c893aa7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -372,6 +372,8 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
+ /* latency priority of the group. */
+ int latency_prio;
#ifdef CONFIG_SMP
/*
@@ -482,6 +484,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
+extern int sched_group_set_latency(struct task_group *tg, int prio);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
--
cgit
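
A minimal sketch of driving the new interface from C, assuming cgroup v2 is mounted
at /sys/fs/cgroup and a child group (hypothetically named "lowlat") already exists;
per the write handler above, values outside [-20, 19] are rejected with -ERANGE.

#include <stdio.h>

int main(void)
{
    const char *path = "/sys/fs/cgroup/lowlat/cpu.latency.nice";
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return 1;
    }
    fprintf(f, "%d\n", -10);    /* group-level latency nice, same scale as the task attribute */
    fclose(f);
    return 0;
}
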
From b412068f928064d23f67709f46d36d7659079e54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 May 2023 13:46:30 +0200
Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice
As an alternative to the latency-nice interface, allow applications to
directly set the request/slice using sched_attr::sched_runtime.
The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
Applications should strive to use their periodic runtime at a high
confidence interval (95%+) as the target slice. Using a smaller slice
will introduce undue preemptions, while using a larger value will
increase latency.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a541fe2d4626..5b71c398f6cf6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7548,10 +7548,18 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
@@ -7750,7 +7758,9 @@ recheck:
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
@@ -8079,12 +8089,14 @@ err_size:
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}
/**
--
cgit
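
A minimal userspace sketch of requesting a specific slice through
sched_attr::sched_runtime on a kernel with the patch above. sched_runtime is already
part of the original (VER0) sched_attr layout, so a 48-byte struct suffices; the
kernel clamps the request to [0.1 ms, 100 ms], and the call also (re)sets
SCHED_NORMAL with nice 0.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr_v0 {          /* SCHED_ATTR_SIZE_VER0 == 48 */
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime;
    uint64_t sched_deadline;
    uint64_t sched_period;
};

int main(void)
{
    struct sched_attr_v0 attr;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.sched_policy = 0;              /* SCHED_NORMAL */
    attr.sched_runtime = 500000;        /* ask for a 0.5 ms request/slice (ns) */

    if (syscall(SYS_sched_setattr, 0, &attr, 0))    /* pid 0 == calling task */
        perror("sched_setattr");
    return 0;
}
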
From 2f88c8e802c8b128a155976631f4eb2ce4f3c805 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Date: Thu, 24 Aug 2023 13:33:42 +0530
Subject: sched/eevdf/doc: Modify the documented knob to base_slice_ns as well
After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns'
sysctl to 'base_slice_ns':
e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
... but we forgot to rename it in the documentation. Do that now.
Fixes: e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230824080342.543396-1-sshegde@linux.vnet.ibm.com
---
Documentation/scheduler/sched-design-CFS.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 03db555045151..f68919800f050 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
way the previous scheduler had, and has no heuristics whatsoever. There is
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
- /sys/kernel/debug/sched/min_granularity_ns
+ /sys/kernel/debug/sched/base_slice_ns
which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
"server" (i.e., good batching) workloads. It defaults to a setting suitable
--
cgit
From 63304558ba5dcaaff9e052ee43cfdcc7f9c29e85 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Aug 2023 15:40:59 +0200
Subject: sched/eevdf: Curb wakeup-preemption
Mike and others noticed that EEVDF does like to over-schedule quite a
bit -- which does hurt performance of a number of benchmarks /
workloads.
In particular, what seems to cause over-scheduling is that when lag is
of the same order (or larger) than the request / slice then placement
will not only cause the task to be placed left of current, but also
with a smaller deadline than current, which causes immediate
preemption.
[ notably, lag bounds are relative to HZ ]
Mike suggested we stick to picking 'current' for as long as it's
eligible to run, giving it uninterrupted runtime until it reaches
parity with the pack.
Augment Mike's suggestion by only allowing it to exhaust its initial
request.
One random data point:
echo NO_RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
3,723,554 context-switches ( +- 0.56% )
9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% )
echo RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
2,556,535 context-switches ( +- 0.51% )
9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% )
Suggested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net
---
kernel/sched/fair.c | 12 ++++++++++++
kernel/sched/features.h | 1 +
2 files changed, 13 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f496cef90ce77..0b7445cd5af98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,6 +873,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
+
while (node) {
struct sched_entity *se = __node_2_se(node);
@@ -5167,6 +5174,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}
update_stats_curr_start(cfs_rq, se);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 61bcbf5e46a45..f770168230ae4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -6,6 +6,7 @@
*/
SCHED_FEAT(PLACE_LAG, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
--
cgit

View File

@@ -2756,3 +2756,885 @@ index 7ff9965570e69..db5853761b1f3 100644
--
cgit
From 246c6d7ab4d042b185d7df71f437137d43cbb83a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 25 Mar 2023 00:14:04 +0100
Subject: sched/eevdf: Better handle mixed slice length
In the case where (due to latency-nice) there are different request
sizes in the tree, the smaller requests tend to be dominated by the
larger. Also note how the EEVDF lag limits are based on r_max.
Therefore, add a heuristic that, for the mixed request size case, moves
smaller requests to placement strategy #2, which ensures they are
immediately eligible and, due to their smaller (virtual) deadline,
will cause preemption.
NOTE: this relies on update_entity_lag() to impose lag limits above
a single slice.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++++
kernel/sched/features.h | 1 +
kernel/sched/sched.h | 1 +
3 files changed, 41 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c8c9f7d8496a..16949f7bbb172 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_slice += se->slice * weight;
cfs_rq->avg_load += weight;
}
@@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_slice -= se->slice * weight;
cfs_rq->avg_load -= weight;
}
@@ -4908,6 +4910,30 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
+static inline bool
+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags)
+{
+ u64 now, vdelta;
+ s64 delta;
+
+ if (!(flags & ENQUEUE_WAKEUP))
+ return false;
+
+ if (flags & ENQUEUE_MIGRATED)
+ return true;
+
+ now = rq_clock_task(rq_of(cfs_rq));
+ delta = now - se->exec_start;
+ if (delta < 0)
+ return false;
+
+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
+ if (vdelta < vslice)
+ return false;
+
+ return true;
+}
+
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -4929,6 +4955,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
lag = se->vlag;
+ /*
+ * For latency sensitive tasks; those that have a shorter than
+ * average slice and do not fully consume the slice, transition
+ * to EEVDF placement strategy #2.
+ */
+ if (sched_feat(PLACE_FUDGE) &&
+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
+ entity_has_slept(cfs_rq, se, vslice, flags)) {
+ lag += vslice;
+ if (lag > 0)
+ lag = 0;
+ }
+
/*
* If we want to place a task and preserve lag, we have to
* consider the effect of the new entity on the weighted
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 54334ca5c5c61..7d65b40299d91 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -5,6 +5,7 @@
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_FUDGE, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db5853761b1f3..bc45beee335c5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -549,6 +549,7 @@ struct cfs_rq {
unsigned int idle_h_nr_running; /* SCHED_IDLE */
s64 avg_vruntime;
+ u64 avg_slice;
u64 avg_load;
u64 exec_clock;
--
cgit
From 36b9081885fee5764b53970dd2d6afe8c2f13b7f Mon Sep 17 00:00:00 2001
From: Parth Shah <parth@linux.ibm.com>
Date: Sat, 11 Mar 2023 12:20:21 +0100
Subject: sched: Introduce latency-nice as a per-task attribute
Latency-nice indicates the latency requirements of a task with respect
to the other tasks in the system. The value of the attribute can be within
the range [-20, 19], both inclusive, to be in line with task nice values.
Just like task nice, -20 is the 'highest' priority and conveys that this
task should get minimal latency; conversely, 19 is the lowest priority
and conveys that this task will get the least consideration and will thus
receive maximal latency.
[peterz: rebase, squash]
Signed-off-by: Parth Shah <parth@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 1 +
include/uapi/linux/sched.h | 4 +++-
include/uapi/linux/sched/types.h | 19 +++++++++++++++++++
init/init_task.c | 3 ++-
kernel/sched/core.c | 27 ++++++++++++++++++++++++++-
kernel/sched/debug.c | 1 +
tools/include/uapi/linux/sched.h | 4 +++-
7 files changed, 55 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 177b3f3676ef8..80bb40a63e9aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -790,6 +790,7 @@ struct task_struct {
int static_prio;
int normal_prio;
unsigned int rt_priority;
+ int latency_prio;
struct sched_entity se;
struct sched_rt_entity rt;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab26..b2e932c25be62 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbfe..db1e8199e8c80 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -10,6 +10,7 @@ struct sched_param {
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */
/*
* Extended scheduling parameters data structure.
@@ -98,6 +99,22 @@ struct sched_param {
* scheduled on a CPU with no more capacity than the specified value.
*
* A task utilization boundary can be reset by setting the attribute to -1.
+ *
+ * Latency Tolerance Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows to specify the relative latency
+ * requirements of a task with respect to the other tasks running/queued in the
+ * system.
+ *
+ * @ sched_latency_nice task's latency_nice value
+ *
+ * The latency_nice of a task can have any value in a range of
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
+ *
+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
+ * taken for a task requiring a lower latency as opposed to the task with
+ * higher latency_nice.
*/
struct sched_attr {
__u32 size;
@@ -120,6 +137,8 @@ struct sched_attr {
__u32 sched_util_min;
__u32 sched_util_max;
+ /* latency requirement hints */
+ __s32 sched_latency_nice;
};
#endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b1..511cbcf3510dc 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,6 +78,7 @@ struct task_struct init_task
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
+ .latency_prio = DEFAULT_PRIO,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
@@ -89,7 +90,7 @@ struct task_struct init_task
.fn = do_no_restart_syscall,
},
.se = {
- .group_node = LIST_HEAD_INIT(init_task.se.group_node),
+ .group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5d3422f7d0de..b3533d0d4a2ca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4757,6 +4757,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->latency_prio = NICE_TO_PRIO(0);
+
/*
* We don't need the reset flag anymore after the fork. It has
* fulfilled its duty:
@@ -7531,7 +7533,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
#define SETPARAM_POLICY -1
static void __setscheduler_params(struct task_struct *p,
- const struct sched_attr *attr)
+ const struct sched_attr *attr)
{
int policy = attr->sched_policy;
@@ -7555,6 +7557,13 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
+static void __setscheduler_latency(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
+ p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+}
+
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -7689,6 +7698,13 @@ recheck:
return retval;
}
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+ if (attr->sched_latency_nice > MAX_NICE)
+ return -EINVAL;
+ if (attr->sched_latency_nice < MIN_NICE)
+ return -EINVAL;
+ }
+
/* Update task specific "requested" clamps */
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
retval = uclamp_validate(p, attr);
@@ -7736,6 +7752,9 @@ recheck:
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
+ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
retval = 0;
@@ -7824,6 +7843,7 @@ change:
__setscheduler_params(p, attr);
__setscheduler_prio(p, newprio);
}
+ __setscheduler_latency(p, attr);
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -8035,6 +8055,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
+ size < SCHED_ATTR_SIZE_VER2)
+ return -EINVAL;
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -8272,6 +8295,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
get_params(p, &kattr);
kattr.sched_flags &= SCHED_FLAG_ALL;
+ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio);
+
#ifdef CONFIG_UCLAMP_TASK
/*
* This could race with another potential updater, but this is fine
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db63..5c743bcb340d2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1086,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#endif
P(policy);
P(prio);
+ P(latency_prio);
if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
index 3bac0a8ceab26..b2e932c25be62 100644
--- a/tools/include/uapi/linux/sched.h
+++ b/tools/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
--
cgit
From 9f9a3323112d3aa5afa466b1e391e137f28dc79d Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Fri, 24 Feb 2023 10:34:51 +0100
Subject: sched/fair: Implement latency-nice
Implement latency-nice as a modulation of the EEVDF r_i parameter,
specifically apply the inverse sched_prio_to_weight[] relation on
base_slice.
Given a base slice of 3 [ms], this gives a range of:
latency-nice 19: 3*1024 / 15 ~= 204.8 [ms]
latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]
(which might not make sense)
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
kernel/sched/core.c | 14 ++++++++++----
kernel/sched/fair.c | 22 +++++++++++++++-------
kernel/sched/sched.h | 2 ++
3 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b3533d0d4a2ca..263caac8f76b7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}
+static inline void set_latency_prio(struct task_struct *p, int prio)
+{
+ p->latency_prio = prio;
+ set_latency_fair(&p->se, prio - MAX_RT_PRIO);
+}
+
#ifdef CONFIG_UCLAMP_TASK
/*
* Serializes updates of utilization clamp values
@@ -4502,9 +4508,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
- p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
+ set_latency_prio(p, p->latency_prio);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
@@ -4756,8 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
-
- p->latency_prio = NICE_TO_PRIO(0);
+ set_latency_prio(p, NICE_TO_PRIO(0));
/*
* We don't need the reset flag anymore after the fork. It has
@@ -7561,7 +7567,7 @@ static void __setscheduler_latency(struct task_struct *p,
const struct sched_attr *attr)
{
if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
- p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 16949f7bbb172..c2019e7d46cf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -952,6 +952,21 @@ int sched_update_scaling(void)
}
#endif
+void set_latency_fair(struct sched_entity *se, int prio)
+{
+ u32 weight = sched_prio_to_weight[prio];
+ u64 base = sysctl_sched_base_slice;
+
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * latency-nice.
+ *
+ * Smaller request gets better latency.
+ */
+ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
+}
+
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
@@ -963,13 +978,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
if ((s64)(se->vruntime - se->deadline) < 0)
return;
- /*
- * For EEVDF the virtual time slope is determined by w_i (iow.
- * nice) while the request time r_i is determined by
- * sysctl_sched_base_slice.
- */
- se->slice = sysctl_sched_base_slice;
-
/*
* EEVDF: vd_i = ve_i + r_i / w_i
*/
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bc45beee335c5..8f8d903a01892 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2520,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
+extern void set_latency_fair(struct sched_entity *se, int prio);
+
#ifdef CONFIG_SCHED_HRTICK
/*
--
cgit
From a317f35154852bc023a7ab2e3fa491e1897af72f Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 24 Feb 2023 10:34:52 +0100
Subject: sched/fair: Add sched group latency support
A task can set its latency priority with sched_setattr(), which is then used
to set the latency offset of its sched_entity, but sched group entities
still have the default latency offset value.
Add a latency.nice field in cpu cgroup controller to set the latency
priority of the group similarly to sched_setattr(). The latency priority
is then used to set the offset of the sched_entities of the group.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20230224093454.956298-7-vincent.guittot@linaro.org
---
Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++
kernel/sched/core.c | 30 ++++++++++++++++++++++++++++++
kernel/sched/fair.c | 27 +++++++++++++++++++++++++++
kernel/sched/sched.h | 4 ++++
4 files changed, 71 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 4ef8901911961..3a8d3e1e55910 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
values similar to the sched_setattr(2). This maximum utilization
value is used to clamp the task specific maximum utilization clamp.
+ cpu.latency.nice
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ The nice value is in the range [-20, 19].
+
+ This interface file allows reading and setting latency using the
+ same values used by sched_setattr(2). The latency_nice of a group is
+ used to limit the impact of the latency_nice of a task outside the
+ group.
Memory
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 263caac8f76b7..8a541fe2d4626 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11247,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return PRIO_TO_NICE(css_tg(css)->latency_prio);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ int prio;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ prio = NICE_TO_PRIO(nice);
+
+ return sched_group_set_latency(css_tg(css), prio);
+}
#endif
static struct cftype cpu_legacy_files[] = {
@@ -11261,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -11500,6 +11524,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2019e7d46cf5..8a4799c600309 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12545,6 +12545,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;
tg->shares = NICE_0_LOAD;
+ tg->latency_prio = DEFAULT_PRIO;
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
@@ -12643,6 +12644,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}
se->my_q = cfs_rq;
+
+ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO);
+
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -12773,6 +12777,29 @@ next_cpu:
return 0;
}
+int sched_group_set_latency(struct task_group *tg, int prio)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->latency_prio == prio) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->latency_prio = prio;
+
+ for_each_possible_cpu(i)
+ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO);
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8f8d903a01892..4236c4c893aa7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -372,6 +372,8 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
+ /* latency priority of the group. */
+ int latency_prio;
#ifdef CONFIG_SMP
/*
@@ -482,6 +484,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
+extern int sched_group_set_latency(struct task_group *tg, int prio);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
--
cgit
From b412068f928064d23f67709f46d36d7659079e54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 May 2023 13:46:30 +0200
Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice
As an alternative to the latency-nice interface, allow applications to
directly set the request/slice using sched_attr::sched_runtime.
The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
Applications should strive to use their periodic runtime at a high
confidence interval (95%+) as the target slice. Using a smaller slice
will introduce undue preemptions, while using a larger value will
increase latency.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a541fe2d4626..5b71c398f6cf6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7548,10 +7548,18 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
@@ -7750,7 +7758,9 @@ recheck:
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
@@ -8079,12 +8089,14 @@ err_size:
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}
/**
--
cgit
From 2f88c8e802c8b128a155976631f4eb2ce4f3c805 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Date: Thu, 24 Aug 2023 13:33:42 +0530
Subject: sched/eevdf/doc: Modify the documented knob to base_slice_ns as well
After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns'
sysctl to 'base_slice_ns':
e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
... but we forgot to rename it in the documentation. Do that now.
Fixes: e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230824080342.543396-1-sshegde@linux.vnet.ibm.com
---
Documentation/scheduler/sched-design-CFS.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 03db555045151..f68919800f050 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
way the previous scheduler had, and has no heuristics whatsoever. There is
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
- /sys/kernel/debug/sched/min_granularity_ns
+ /sys/kernel/debug/sched/base_slice_ns
which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
"server" (i.e., good batching) workloads. It defaults to a setting suitable
--
cgit
From 63304558ba5dcaaff9e052ee43cfdcc7f9c29e85 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Aug 2023 15:40:59 +0200
Subject: sched/eevdf: Curb wakeup-preemption
Mike and others noticed that EEVDF does like to over-schedule quite a
bit -- which does hurt performance of a number of benchmarks /
workloads.
In particular, what seems to cause over-scheduling is that when lag is
of the same order (or larger) than the request / slice then placement
will not only cause the task to be placed left of current, but also
with a smaller deadline than current, which causes immediate
preemption.
[ notably, lag bounds are relative to HZ ]
Mike suggested we stick to picking 'current' for as long as it's
eligible to run, giving it uninterrupted runtime until it reaches
parity with the pack.
Augment Mike's suggestion by only allowing it to exhaust its initial
request.
One random data point:
echo NO_RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
3,723,554 context-switches ( +- 0.56% )
9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% )
echo RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
2,556,535 context-switches ( +- 0.51% )
9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% )
Suggested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net
---
kernel/sched/fair.c | 12 ++++++++++++
kernel/sched/features.h | 1 +
2 files changed, 13 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f496cef90ce77..0b7445cd5af98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,6 +873,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
+
while (node) {
struct sched_entity *se = __node_2_se(node);
@@ -5167,6 +5174,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}
update_stats_curr_start(cfs_rq, se);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 61bcbf5e46a45..f770168230ae4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -6,6 +6,7 @@
*/
SCHED_FEAT(PLACE_LAG, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
--
cgit