diff --git a/linux-tkg-patches/6.5/0003-eevdf.patch b/linux-tkg-patches/6.5/0003-eevdf.patch
index c73f78f..b52b29c 100644
--- a/linux-tkg-patches/6.5/0003-eevdf.patch
+++ b/linux-tkg-patches/6.5/0003-eevdf.patch
@@ -1,37 +1,526 @@
-From af4cf40470c22efa3987200fd19478199e08e103 Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:40 +0200
-Subject: sched/fair: Add cfs_rq::avg_vruntime
+From f40e0ab8e18aa28a5a37e0a7574559f2a914d697 Mon Sep 17 00:00:00 2001
+From: Piotr Gorski <lucjan.lucjanov@gmail.com>
+Date: Fri, 15 Sep 2023 18:05:03 +0200
+Subject: [PATCH] sched-6.5: Introduce EEVDF
 
-In order to move to an eligibility based scheduling policy, we need
-to have a better approximation of the ideal scheduler.
-
-Specifically, for a virtual time weighted fair queueing based
-scheduler the ideal scheduler will be the weighted average of the
-individual virtual runtimes (math in the comment).
-
-As such, compute the weighted average to approximate the ideal
-scheduler -- note that the approximation is in the individual task
-behaviour, which isn't strictly conformant.
-
-Specifically consider adding a task with a vruntime left of center, in
-this case the average will move backwards in time -- something the
-ideal scheduler would of course never do.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124603.654144274@infradead.org
+Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
 ---
- kernel/sched/debug.c |  32 ++++++------
- kernel/sched/fair.c  | 137 +++++++++++++++++++++++++++++++++++++++++++++++++--
- kernel/sched/sched.h |   5 ++
- 3 files changed, 154 insertions(+), 20 deletions(-)
+ Documentation/admin-guide/cgroup-v2.rst      |   10 +
+ Documentation/scheduler/sched-design-CFS.rst |    2 +-
+ include/linux/rbtree_augmented.h             |   26 +
+ include/linux/sched.h                        |    9 +-
+ include/uapi/linux/sched.h                   |    4 +-
+ include/uapi/linux/sched/types.h             |   19 +
+ init/init_task.c                             |    3 +-
+ kernel/sched/core.c                          |  130 +-
+ kernel/sched/debug.c                         |   49 +-
+ kernel/sched/fair.c                          | 1141 +++++++++---------
+ kernel/sched/features.h                      |   25 +-
+ kernel/sched/sched.h                         |   24 +-
+ tools/include/uapi/linux/sched.h             |    4 +-
+ 13 files changed, 783 insertions(+), 663 deletions(-)
 
+diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
+index 4ef890191..3a8d3e1e5 100644
+--- a/Documentation/admin-guide/cgroup-v2.rst
++++ b/Documentation/admin-guide/cgroup-v2.rst
+@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
+         values similar to the sched_setattr(2). This maximum utilization
+         value is used to clamp the task specific maximum utilization clamp.
+
++  cpu.latency.nice
++	A read-write single value file which exists on non-root
++	cgroups.  The default is "0".
++
++	The nice value is in the range [-20, 19].
++
++	This interface file allows reading and setting latency using the
++	same values used by sched_setattr(2). The latency_nice of a group is
++	used to limit the impact of the latency_nice of a task outside the
++	group.
+
+
+ Memory
+diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
+index 03db55504..f68919800 100644
+--- a/Documentation/scheduler/sched-design-CFS.rst
++++ b/Documentation/scheduler/sched-design-CFS.rst
+@@ -94,7 +94,7 @@ other HZ detail.  Thus the CFS scheduler has no notion of "timeslices" in the
+ way the previous scheduler had, and has no heuristics whatsoever.  There is
+ only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
+
+-   /sys/kernel/debug/sched/min_granularity_ns
++   /sys/kernel/debug/sched/base_slice_ns
+
+ which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
+ "server" (i.e., good batching) workloads.  It defaults to a setting suitable
+diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
+index 7ee7ed5de..6dbc5a1bf 100644
+--- a/include/linux/rbtree_augmented.h
++++ b/include/linux/rbtree_augmented.h
+@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node,
+ 	rb_insert_augmented(node, &root->rb_root, augment);
+ }
+
++static __always_inline struct rb_node *
++rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
++			bool (*less)(struct rb_node *, const struct rb_node *),
++			const struct rb_augment_callbacks *augment)
++{
++	struct rb_node **link = &tree->rb_root.rb_node;
++	struct rb_node *parent = NULL;
++	bool leftmost = true;
++
++	while (*link) {
++		parent = *link;
++		if (less(node, parent)) {
++			link = &parent->rb_left;
++		} else {
++			link = &parent->rb_right;
++			leftmost = false;
++		}
++	}
++
++	rb_link_node(node, parent, link);
++	augment->propagate(parent, NULL); /* suboptimal */
++	rb_insert_augmented_cached(node, tree, leftmost, augment);
++
++	return leftmost ? node : NULL;
++}
++
+ /*
+  * Template for declaring augmented rbtree callbacks (generic case)
+  *
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 609bde814..e6f3a5e38 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -549,13 +549,18 @@ struct sched_entity {
+ 	/* For load-balancing: */
+ 	struct load_weight		load;
+ 	struct rb_node			run_node;
++	u64				deadline;
++	u64				min_deadline;
++
+ 	struct list_head		group_node;
+ 	unsigned int			on_rq;
+
+ 	u64				exec_start;
+ 	u64				sum_exec_runtime;
+-	u64				vruntime;
+ 	u64				prev_sum_exec_runtime;
++	u64				vruntime;
++	s64				vlag;
++	u64				slice;
+
+ 	u64				nr_migrations;
+
+@@ -785,6 +790,7 @@ struct task_struct {
+ 	int				static_prio;
+ 	int				normal_prio;
+ 	unsigned int			rt_priority;
++	int				latency_prio;
+
+ 	struct sched_entity		se;
+ 	struct sched_rt_entity		rt;
+@@ -886,6 +892,7 @@ struct task_struct {
+ 	unsigned			sched_reset_on_fork:1;
+ 	unsigned			sched_contributes_to_load:1;
+ 	unsigned			sched_migrated:1;
++	unsigned			sched_delayed:1;
+
+ 	/* Force alignment to the next boundary: */
+ 	unsigned			:0;
+diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
+index 3bac0a8ce..b2e932c25 100644
+--- a/include/uapi/linux/sched.h
++++ b/include/uapi/linux/sched.h
+@@ -132,6 +132,7 @@ struct clone_args {
+ #define SCHED_FLAG_KEEP_PARAMS		0x10
+ #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
+ #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
++#define SCHED_FLAG_LATENCY_NICE		0x80
+
+ #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
+ 				 SCHED_FLAG_KEEP_PARAMS)
+@@ -143,6 +144,7 @@ struct clone_args {
+ 			 SCHED_FLAG_RECLAIM		| \
+ 			 SCHED_FLAG_DL_OVERRUN		| \
+ 			 SCHED_FLAG_KEEP_ALL		| \
+-			 SCHED_FLAG_UTIL_CLAMP)
++			 SCHED_FLAG_UTIL_CLAMP		| \
++			 SCHED_FLAG_LATENCY_NICE)
+
+ #endif /* _UAPI_LINUX_SCHED_H */
+diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
+index f2c4589d4..db1e8199e 100644
+--- a/include/uapi/linux/sched/types.h
++++ b/include/uapi/linux/sched/types.h
+@@ -10,6 +10,7 @@ struct sched_param {
+
+ #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
+ #define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */
++#define SCHED_ATTR_SIZE_VER2	60	/* add: latency_nice */
+
+ /*
+  * Extended scheduling parameters data structure.
+@@ -98,6 +99,22 @@ struct sched_param {
+  * scheduled on a CPU with no more capacity than the specified value.
+  *
+  * A task utilization boundary can be reset by setting the attribute to -1.
++ *
++ * Latency Tolerance Attributes
++ * ===========================
++ *
++ * A subset of sched_attr attributes allows to specify the relative latency
++ * requirements of a task with respect to the other tasks running/queued in the
++ * system.
++ *
++ * @ sched_latency_nice	task's latency_nice value
++ *
++ * The latency_nice of a task can have any value in the range
++ * [MIN_NICE..MAX_NICE].
++ *
++ * A task with latency_nice set to MIN_NICE can be taken for a
++ * task requiring a lower latency, as opposed to a task with a
++ * higher latency_nice.
+  */
+ struct sched_attr {
+ 	__u32 size;
+@@ -120,6 +137,8 @@ struct sched_attr {
+ 	__u32 sched_util_min;
+ 	__u32 sched_util_max;
+
++	/* latency requirement hints */
++	__s32 sched_latency_nice;
+ };
+
+ #endif /* _UAPI_LINUX_SCHED_TYPES_H */
+diff --git a/init/init_task.c b/init/init_task.c
+index ff6c4b9bf..511cbcf35 100644
+--- a/init/init_task.c
++++ b/init/init_task.c
+@@ -78,6 +78,7 @@ struct task_struct init_task
+ 	.prio		= MAX_PRIO - 20,
+ 	.static_prio	= MAX_PRIO - 20,
+ 	.normal_prio	= MAX_PRIO - 20,
++	.latency_prio	= DEFAULT_PRIO,
+ 	.policy		= SCHED_NORMAL,
+ 	.cpus_ptr	= &init_task.cpus_mask,
+ 	.user_cpus_ptr	= NULL,
+@@ -89,7 +90,7 @@ struct task_struct init_task
+ 		.fn = do_no_restart_syscall,
+ 	},
+ 	.se		= {
+-		.group_node 	= LIST_HEAD_INIT(init_task.se.group_node),
++		.group_node	= LIST_HEAD_INIT(init_task.se.group_node),
+ 	},
+ 	.rt		= {
+ 		.run_list	= LIST_HEAD_INIT(init_task.rt.run_list),
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index c52c2eba7..095c46027 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
+ 	}
+ }
+
++static inline void set_latency_prio(struct task_struct *p, int prio)
++{
++	p->latency_prio = prio;
++	set_latency_fair(&p->se, prio - MAX_RT_PRIO);
++}
++
+ #ifdef CONFIG_UCLAMP_TASK
+ /*
+  * Serializes updates of utilization clamp values
+@@ -4501,8 +4507,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+ 	p->se.prev_sum_exec_runtime	= 0;
+ 	p->se.nr_migrations		= 0;
+ 	p->se.vruntime			= 0;
++	p->se.vlag			= 0;
+ 	INIT_LIST_HEAD(&p->se.group_node);
+
++	set_latency_prio(p, p->latency_prio);
++
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ 	p->se.cfs_rq			= NULL;
+ #endif
+@@ -4754,6 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
+
+ 		p->prio = p->normal_prio = p->static_prio;
+ 		set_load_weight(p, false);
++		set_latency_prio(p, NICE_TO_PRIO(0));
+
+ 		/*
+ 		 * We don't need the reset flag anymore after the fork. It has
+@@ -6549,6 +6559,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+ # define SM_MASK_PREEMPT	SM_PREEMPT
+ #endif
+
++static void __deschedule_task(struct rq *rq, struct task_struct *p)
++{
++	deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
++
++	if (p->in_iowait) {
++		atomic_inc(&rq->nr_iowait);
++		delayacct_blkio_start();
++	}
++}
++
+ /*
+  * __schedule() is the main scheduler function.
+  *
+@@ -6661,17 +6681,36 @@ static void __sched notrace __schedule(unsigned int sched_mode)
+ 			 *
+ 			 * After this, schedule() must not care about p->state any more.
+ 			 */
+-			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
++			if (!(sched_feat(DELAY_DEQUEUE) &&
++			      prev->sched_class->eligible_task &&
++			      !prev->sched_class->eligible_task(rq, prev)))
++				__deschedule_task(rq, prev);
++			else
++				prev->sched_delayed = 1;
++		}
++		switch_count = &prev->nvcsw;
++	}
++
++	for (struct task_struct *tmp = prev;;) {
+
+-			if (prev->in_iowait) {
+-				atomic_inc(&rq->nr_iowait);
+-				delayacct_blkio_start();
++		next = pick_next_task(rq, tmp, &rf);
++		if (unlikely(tmp != prev))
++			finish_task(tmp);
++
++		if (sched_feat(DELAY_DEQUEUE) && unlikely(next->sched_delayed)) {
++			next->sched_delayed = 0;
++			if (READ_ONCE(next->__state)) {
++				prepare_task(next);
++				smp_wmb();
++				__deschedule_task(rq, next);
++				tmp = next;
++				continue;
+ 			}
+ 		}
+-		switch_count = &prev->nvcsw;
++
++		break;
+ 	}
+
+-	next = pick_next_task(rq, prev, &rf);
+ 	clear_tsk_need_resched(prev);
+ 	clear_preempt_need_resched();
+ #ifdef CONFIG_SCHED_DEBUG
+@@ -7516,7 +7555,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
+ #define SETPARAM_POLICY	-1
+
+ static void __setscheduler_params(struct task_struct *p,
+-		const struct sched_attr *attr)
++				  const struct sched_attr *attr)
+ {
+ 	int policy = attr->sched_policy;
+
+@@ -7525,10 +7564,18 @@ static void __setscheduler_params(struct task_struct *p,
+
+ 	p->policy = policy;
+
+-	if (dl_policy(policy))
++	if (dl_policy(policy)) {
+ 		__setparam_dl(p, attr);
+-	else if (fair_policy(policy))
++	} else if (fair_policy(policy)) {
+ 		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
++		if (attr->sched_runtime) {
++			p->se.slice = clamp_t(u64, attr->sched_runtime,
++					      NSEC_PER_MSEC/10,   /* HZ=1000 * 10 */
++					      NSEC_PER_MSEC*100); /* HZ=100  / 10 */
++		} else {
++			p->se.slice = sysctl_sched_base_slice;
++		}
++	}
+
+ 	/*
+ 	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
+@@ -7540,6 +7587,13 @@ static void __setscheduler_params(struct task_struct *p,
+ 	set_load_weight(p, true);
+ }
+
++static void __setscheduler_latency(struct task_struct *p,
++				   const struct sched_attr *attr)
++{
++	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
++		set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
++}
++
+ /*
+  * Check the target process has a UID that matches the current process's:
+  */
+@@ -7674,6 +7728,13 @@ static int __sched_setscheduler(struct task_struct *p,
+ 			return retval;
+ 	}
+
++	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
++		if (attr->sched_latency_nice > MAX_NICE)
++			return -EINVAL;
++		if (attr->sched_latency_nice < MIN_NICE)
++			return -EINVAL;
++	}
++
+ 	/* Update task specific "requested" clamps */
+ 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+ 		retval = uclamp_validate(p, attr);
+@@ -7713,7 +7774,9 @@ static int __sched_setscheduler(struct task_struct *p,
+ 	 * but store a possible modification of reset_on_fork.
+ 	 */
+ 	if (unlikely(policy == p->policy)) {
+-		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
++		if (fair_policy(policy) &&
++		    (attr->sched_nice != task_nice(p) ||
++		     (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
+ 			goto change;
+ 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+ 			goto change;
+@@ -7721,6 +7784,9 @@ static int __sched_setscheduler(struct task_struct *p,
+ 			goto change;
+ 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+ 			goto change;
++		if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
++		    attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
++			goto change;
+
+ 		p->sched_reset_on_fork = reset_on_fork;
+ 		retval = 0;
+@@ -7809,6 +7875,7 @@ static int __sched_setscheduler(struct task_struct *p,
+ 		__setscheduler_params(p, attr);
+ 		__setscheduler_prio(p, newprio);
+ 	}
++	__setscheduler_latency(p, attr);
+ 	__setscheduler_uclamp(p, attr);
+
+ 	if (queued) {
+@@ -8020,6 +8087,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
+ 	    size < SCHED_ATTR_SIZE_VER1)
+ 		return -EINVAL;
+
++	if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
++	    size < SCHED_ATTR_SIZE_VER2)
++		return -EINVAL;
+ 	/*
+ 	 * XXX: Do we want to be lenient like existing syscalls; or do we want
+ 	 * to be strict and return an error on out-of-bounds values?
+@@ -8035,12 +8105,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
+
+ static void get_params(struct task_struct *p, struct sched_attr *attr)
+ {
+-	if (task_has_dl_policy(p))
++	if (task_has_dl_policy(p)) {
+ 		__getparam_dl(p, attr);
+-	else if (task_has_rt_policy(p))
++	} else if (task_has_rt_policy(p)) {
+ 		attr->sched_priority = p->rt_priority;
+-	else
++	} else {
+ 		attr->sched_nice = task_nice(p);
++		attr->sched_runtime = p->se.slice;
++	}
+ }
+
+ /**
+@@ -8257,6 +8329,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ 	get_params(p, &kattr);
+ 	kattr.sched_flags &= SCHED_FLAG_ALL;
+
++	kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio);
++
+ #ifdef CONFIG_UCLAMP_TASK
+ 	/*
+ 	 * This could race with another potential updater, but this is fine
+@@ -11180,6 +11254,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ {
+ 	return sched_group_set_idle(css_tg(css), idle);
+ }
++
++static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
++				    struct cftype *cft)
++{
++	return PRIO_TO_NICE(css_tg(css)->latency_prio);
++}
++
++static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
++				     struct cftype *cft, s64 nice)
++{
++	int prio;
++
++	if (nice < MIN_NICE || nice > MAX_NICE)
++		return -ERANGE;
++
++	prio = NICE_TO_PRIO(nice);
++
++	return sched_group_set_latency(css_tg(css), prio);
++}
+ #endif
+
+ static struct cftype cpu_legacy_files[] = {
+@@ -11194,6 +11287,11 @@ static struct cftype cpu_legacy_files[] = {
+ 		.read_s64 = cpu_idle_read_s64,
+ 		.write_s64 = cpu_idle_write_s64,
+ 	},
++	{
++		.name = "latency.nice",
++		.read_s64 = cpu_latency_nice_read_s64,
++		.write_s64 = cpu_latency_nice_write_s64,
++	},
+ #endif
+ #ifdef CONFIG_CFS_BANDWIDTH
+ 	{
+@@ -11411,6 +11509,12 @@ static struct cftype cpu_files[] = {
+ 		.read_s64 = cpu_idle_read_s64,
+ 		.write_s64 = cpu_idle_write_s64,
+ 	},
++	{
++		.name = "latency.nice",
++		.flags = CFTYPE_NOT_ON_ROOT,
++		.read_s64 = cpu_latency_nice_read_s64,
++		.write_s64 = cpu_latency_nice_write_s64,
++	},
+ #endif
+ #ifdef CONFIG_CFS_BANDWIDTH
+ 	{
 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
-index aeeba46a096b9..e48d2b2db7bca 100644
+index 066ff1c8a..e7e83181f 100644
 --- a/kernel/sched/debug.c
 +++ b/kernel/sched/debug.c
-@@ -627,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
+@@ -347,10 +347,7 @@ static __init int sched_init_debug(void)
+ 	debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
+ #endif
+
+-	debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
+-	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+-	debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
+-	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
++	debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
+
+ 	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
+ 	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
+@@ -581,9 +578,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
+ 	else
+ 		SEQ_printf(m, " %c", task_state_to_char(p));
+
+-	SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
++	SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
+ 		p->comm, task_pid_nr(p),
+ 		SPLIT_NS(p->se.vruntime),
++		entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
++		SPLIT_NS(p->se.deadline),
++		SPLIT_NS(p->se.slice),
++		SPLIT_NS(p->se.sum_exec_runtime),
+ 		(long long)(p->nvcsw + p->nivcsw),
+ 		p->prio);
+
+@@ -626,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
  void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
  {
@@ -44,7 +533,7 @@ index aeeba46a096b9..e48d2b2db7bca 100644
  	unsigned long flags;
 
  #ifdef CONFIG_FAIR_GROUP_SCHED
-@@ -644,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+@@ -643,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
  			SPLIT_NS(cfs_rq->exec_clock));
 
  	raw_spin_rq_lock_irqsave(rq, flags);
@@ -84,11 +573,133 @@ index aeeba46a096b9..e48d2b2db7bca 100644
  	SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
  			cfs_rq->nr_spread_over);
  	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+@@ -863,10 +862,7 @@ static void sched_debug_header(struct seq_file *m)
+ 	SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
+ #define PN(x) \
+ 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+-	PN(sysctl_sched_latency);
+-	PN(sysctl_sched_min_granularity);
+-	PN(sysctl_sched_idle_min_granularity);
+-	PN(sysctl_sched_wakeup_granularity);
++	PN(sysctl_sched_base_slice);
+ 	P(sysctl_sched_child_runs_first);
+ 	P(sysctl_sched_features);
+ #undef PN
+@@ -1089,6 +1085,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+ #endif
+ 	P(policy);
+ 	P(prio);
++	P(latency_prio);
+ 	if (task_has_dl_policy(p)) {
+ 		P(dl.runtime);
+ 		P(dl.deadline);
 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index d3df5b1642a6f..bb5460682ae2e 100644
+index 1d9c2482c..4f23f545e 100644
 --- a/kernel/sched/fair.c
 +++ b/kernel/sched/fair.c
-@@ -601,9 +601,134 @@ static inline bool entity_before(const struct sched_entity *a,
+@@ -47,6 +47,7 @@
+ #include <linux/psi.h>
+ #include <linux/ratelimit.h>
+ #include <linux/task_work.h>
++#include <linux/rbtree_augmented.h>
+
+ #include <asm/switch_to.h>
+
+@@ -56,22 +57,6 @@
+ #include "stats.h"
+ #include "autogroup.h"
+
+-/*
+- * Targeted preemption latency for CPU-bound tasks:
+- *
+- * NOTE: this latency value is not the same as the concept of
+- * 'timeslice length' - timeslices in CFS are of variable length
+- * and have no persistent notion like in traditional, time-slice
+- * based scheduling concepts.
+- *
+- * (to see the precise effective timeslice length of your workload,
+- *  run vmstat and monitor the context-switches (cs) field)
+- *
+- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
+- */
+-unsigned int sysctl_sched_latency			= 6000000ULL;
+-static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;
+-
+ /*
+  * The initial- and re-scaling of tunables is configurable
+  *
+@@ -90,21 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+  *
+  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+  */
+-unsigned int sysctl_sched_min_granularity			= 750000ULL;
+-static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
+-
+-/*
+- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
+- * Applies only when SCHED_IDLE tasks compete with normal tasks.
+- *
+- * (default: 0.75 msec)
+- */
+-unsigned int sysctl_sched_idle_min_granularity			= 750000ULL;
+-
+-/*
+- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
+- */
+-static unsigned int sched_nr_latency = 8;
++unsigned int sysctl_sched_base_slice			= 750000ULL;
++static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;
+
+ /*
+  * After fork, child runs first. If set to 0 (default) then
+@@ -112,18 +84,6 @@ static unsigned int sched_nr_latency = 8;
+  */
+ unsigned int sysctl_sched_child_runs_first __read_mostly;
+
+-/*
+- * SCHED_OTHER wake-up granularity.
+- *
+- * This option delays the preemption effects of decoupled workloads
+- * and reduces their over-scheduling. Synchronous workloads will still
+- * have immediate wakeup/sleep latencies.
+- *
+- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+- */
+-unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
+-static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
+-
+ const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
+
+ int sched_thermal_decay_shift;
+@@ -277,9 +237,7 @@ static void update_sysctl(void)
+
+ #define SET_SYSCTL(name) \
+ 	(sysctl_##name = (factor) * normalized_sysctl_##name)
+-	SET_SYSCTL(sched_min_granularity);
+-	SET_SYSCTL(sched_latency);
+-	SET_SYSCTL(sched_wakeup_granularity);
++	SET_SYSCTL(sched_base_slice);
+ #undef SET_SYSCTL
+ }
+
+@@ -347,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
+ 	return mul_u64_u32_shr(delta_exec, fact, shift);
+ }
+
++/*
++ * delta /= w
++ */
++static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
++{
++	if (unlikely(se->load.weight != NICE_0_LOAD))
++		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
++
++	return delta;
++}
+
+ const struct sched_class fair_sched_class;
+
+@@ -601,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a,
  	return (s64)(a->vruntime - b->vruntime) < 0;
  }
 
@@ -206,6 +817,66 @@ index d3df5b1642a6f..bb5460682ae2e 100644
 +	return cfs_rq->min_vruntime + avg;
 +}
 +
++/*
++ * lag_i = S - s_i = w_i * (V - v_i)
++ *
++ * However, since V is approximated by the weighted average of all entities it
++ * is possible -- by addition/removal/reweight to the tree -- to move V around
++ * and end up with a larger lag than we started with.
++ *
++ * Limit this to double the slice length, with a minimum of TICK_NSEC
++ * since that is the timing granularity.
++ *
++ * EEVDF gives the following limit for a steady state system:
++ *
++ *   -r_max < lag < max(r_max, q)
++ *
++ * XXX could add max_slice to the augmented data to track this.
++ */
++void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
++{
++	s64 lag, limit;
++
++	SCHED_WARN_ON(!se->on_rq);
++	lag = avg_vruntime(cfs_rq) - se->vruntime;
++
++	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
++	se->vlag = clamp(lag, -limit, limit);
++}
++
++/*
++ * Entity is eligible once it has received no more service than it ought to
++ * have, i.e. lag >= 0.
++ *
++ * lag_i = S - s_i = w_i*(V - v_i)
++ *
++ * lag_i >= 0 -> V >= v_i
++ *
++ *     \Sum (v_i - v)*w_i
++ * V = ------------------ + v
++ *          \Sum w_i
++ *
++ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
++ *
++ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
++ *       to the loss in precision caused by the division.
++ */
++int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
++{
++	struct sched_entity *curr = cfs_rq->curr;
++	s64 avg = cfs_rq->avg_vruntime;
++	long load = cfs_rq->avg_load;
++
++	if (curr && curr->on_rq) {
++		unsigned long weight = scale_load_down(curr->load.weight);
++
++		avg += entity_key(cfs_rq, curr) * weight;
++		load += weight;
++	}
++
++	return avg >= entity_key(cfs_rq, se) * load;
++}
++
 +static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 +{
 +	u64 min_vruntime = cfs_rq->min_vruntime;
@@ -222,8 +893,24 @@ index d3df5b1642a6f..bb5460682ae2e 100644
 +
  static void update_min_vruntime(struct cfs_rq *cfs_rq)
  {
++	struct sched_entity *se = __pick_first_entity(cfs_rq);
  	struct sched_entity *curr = cfs_rq->curr;
-@@ -629,7 +754,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
+-	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
+
+ 	u64 vruntime = cfs_rq->min_vruntime;
+
+@@ -618,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
+ 			curr = NULL;
+ 	}
+
+-	if (leftmost) { /* non-empty tree */
+-		struct sched_entity *se = __node_2_se(leftmost);
+-
++	if (se) {
+ 		if (!curr)
+ 			vruntime = se->vruntime;
+ 		else
+@@ -629,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 
  	/* ensure we never gain time by being placed backwards. */
  	u64_u32_store(cfs_rq->min_vruntime,
@@ -232,22 +919,302 @@ index d3df5b1642a6f..bb5460682ae2e 100644
  }
 
  static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
-@@ -642,12 +767,14 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
+@@ -637,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
+ 	return entity_before(__node_2_se(a), __node_2_se(b));
+ }
+
++#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
++
++static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
++{
++	if (node) {
++		struct sched_entity *rse = __node_2_se(node);
++		if (deadline_gt(min_deadline, se, rse))
++			se->min_deadline = rse->min_deadline;
++	}
++}
++
++/*
++ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
++ */
++static inline bool min_deadline_update(struct sched_entity *se, bool exit)
++{
++	u64 old_min_deadline = se->min_deadline;
++	struct rb_node *node = &se->run_node;
++
++	se->min_deadline = se->deadline;
++	__update_min_deadline(se, node->rb_right);
++	__update_min_deadline(se, node->rb_left);
++
++	return se->min_deadline == old_min_deadline;
++}
++
++RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
++		     run_node, min_deadline, min_deadline_update);
++
+ /*
+  * Enqueue an entity into the rb-tree:
   */
  static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
+-	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
 +	avg_vruntime_add(cfs_rq, se);
- 	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
++	se->min_deadline = se->deadline;
++	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
++				__entity_less, &min_deadline_cb);
  }
 
  static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
- 	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+-	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
++	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
++				  &min_deadline_cb);
 +	avg_vruntime_sub(cfs_rq, se);
  }
 
  struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
-@@ -3379,6 +3506,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+@@ -660,14 +845,88 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
+ 	return __node_2_se(left);
+ }
+
+-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
++/*
++ * Earliest Eligible Virtual Deadline First
++ *
++ * In order to provide latency guarantees for different request sizes
++ * EEVDF selects the best runnable task from two criteria:
++ *
++ *  1) the task must be eligible (must be owed service)
++ *
++ *  2) from those tasks that meet 1), we select the one
++ *     with the earliest virtual deadline.
++ *
++ * We can do this in O(log n) time due to an augmented RB-tree. The
++ * tree keeps the entries sorted on service, but also functions as a
++ * heap based on the deadline by keeping:
++ *
++ *  se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
++ *
++ * Which allows an EDF like search on (sub)trees.
++ */
++static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+ {
+-	struct rb_node *next = rb_next(&se->run_node);
++	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
++	struct sched_entity *curr = cfs_rq->curr;
++	struct sched_entity *best = NULL;
+
+-	if (!next)
+-		return NULL;
++	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
++		curr = NULL;
++
++	/*
++	 * Once selected, run a task until it either becomes non-eligible or
++	 * until it gets a new slice. See the HACK in set_next_entity().
++	 */
++	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
++		return curr;
++
++	while (node) {
++		struct sched_entity *se = __node_2_se(node);
++
++		/*
++		 * If this entity is not eligible, try the left subtree.
++		 */
++		if (!entity_eligible(cfs_rq, se)) {
++			node = node->rb_left;
++			continue;
++		}
++
++		/*
++		 * If this entity has an earlier deadline than the previous
++		 * best, take this one. If it also has the earliest deadline
++		 * of its subtree, we're done.
++		 */
++		if (!best || deadline_gt(deadline, best, se)) {
++			best = se;
++			if (best->deadline == best->min_deadline)
++				break;
++		}
++
++		/*
++		 * If the earliest deadline in this subtree is in the fully
++		 * eligible left half of our space, go there.
++		 */
++		if (node->rb_left &&
++		    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
++			node = node->rb_left;
++			continue;
++		}
++
++		node = node->rb_right;
++	}
++
++	if (!best || (curr && deadline_gt(deadline, best, curr)))
++		best = curr;
+
+-	return __node_2_se(next);
++	if (unlikely(!best)) {
++		struct sched_entity *left = __pick_first_entity(cfs_rq);
++		if (left) {
++			pr_err("EEVDF scheduling fail, picking leftmost\n");
++			return left;
++		}
++	}
++
++	return best;
+ }
+
+ #ifdef CONFIG_SCHED_DEBUG
+@@ -689,104 +948,53 @@ int sched_update_scaling(void)
+ {
+ 	unsigned int factor = get_update_sysctl_factor();
+
+-	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+-					sysctl_sched_min_granularity);
+-
+ #define WRT_SYSCTL(name) \
+ 	(normalized_sysctl_##name = sysctl_##name / (factor))
+-	WRT_SYSCTL(sched_min_granularity);
+-	WRT_SYSCTL(sched_latency);
+-	WRT_SYSCTL(sched_wakeup_granularity);
++	WRT_SYSCTL(sched_base_slice);
+ #undef WRT_SYSCTL
+
+ 	return 0;
+ }
+ #endif
+
+-/*
+- * delta /= w
+- */
+-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
++void set_latency_fair(struct sched_entity *se, int prio)
+ {
+-	if (unlikely(se->load.weight != NICE_0_LOAD))
+-		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
++	u32 weight = sched_prio_to_weight[prio];
++	u64 base = sysctl_sched_base_slice;
+
+-	return delta;
+-}
+-
+-/*
+- * The idea is to set a period in which each task runs once.
+- *
+- * When there are too many tasks (sched_nr_latency) we have to stretch
+- * this period because otherwise the slices get too small.
+- *
+- * p = (nr <= nl) ? l : l*nr/nl
+- */
+-static u64 __sched_period(unsigned long nr_running)
+-{
+-	if (unlikely(nr_running > sched_nr_latency))
+-		return nr_running * sysctl_sched_min_granularity;
+-	else
+-		return sysctl_sched_latency;
++	/*
++	 * For EEVDF the virtual time slope is determined by w_i (iow.
++	 * nice) while the request time r_i is determined by
++	 * latency-nice.
++	 *
++	 * Smaller request gets better latency.
++	 */
++	se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
+ }
+
+-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
++static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
+ /*
+- * We calculate the wall-time slice from the period by taking a part
+- * proportional to the weight.
+- *
+- * s = p*P[w/rw]
++ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
++ * this is probably good enough.
+  */
+-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
++static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ {
+-	unsigned int nr_running = cfs_rq->nr_running;
+-	struct sched_entity *init_se = se;
+-	unsigned int min_gran;
+-	u64 slice;
+-
+-	if (sched_feat(ALT_PERIOD))
+-		nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
+-
+-	slice = __sched_period(nr_running + !se->on_rq);
+-
+-	for_each_sched_entity(se) {
+-		struct load_weight *load;
+-		struct load_weight lw;
+-		struct cfs_rq *qcfs_rq;
+-
+-		qcfs_rq = cfs_rq_of(se);
+-		load = &qcfs_rq->load;
+-
+-		if (unlikely(!se->on_rq)) {
+-			lw = qcfs_rq->load;
+-
+-			update_load_add(&lw, se->load.weight);
+-			load = &lw;
+-		}
+-		slice = __calc_delta(slice, se->load.weight, load);
+-	}
++	if ((s64)(se->vruntime - se->deadline) < 0)
++		return;
+
+-	if (sched_feat(BASE_SLICE)) {
+-		if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
+-			min_gran = sysctl_sched_idle_min_granularity;
+-		else
+-			min_gran = sysctl_sched_min_granularity;
++	/*
++	 * EEVDF: vd_i = ve_i + r_i / w_i
++	 */
++	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
+
+-		slice = max_t(u64, slice, min_gran);
++	/*
++	 * The task has consumed its request, reschedule.
++	 */
++	if (cfs_rq->nr_running > 1) {
++		resched_curr(rq_of(cfs_rq));
++		clear_buddies(cfs_rq, se);
+ 	}
+-
+-	return slice;
+-}
+-
+-/*
+- * We calculate the vruntime slice of a to-be-inserted task.
+- *
+- * vs = s/w
+- */
+-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+-{
+-	return calc_delta_fair(sched_slice(cfs_rq, se), se);
+ }
+
+ #include "pelt.h"
+@@ -921,6 +1129,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
+ 	schedstat_add(cfs_rq->exec_clock, delta_exec);
+
+ 	curr->vruntime += calc_delta_fair(delta_exec, curr);
++	update_deadline(cfs_rq, curr);
+ 	update_min_vruntime(cfs_rq);
+
+ 	if (entity_is_task(curr)) {
+@@ -3375,16 +3584,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ 			    unsigned long weight)
+ {
++	unsigned long old_weight = se->load.weight;
++
+ 	if (se->on_rq) {
  		/* commit outstanding execution time */
  		if (cfs_rq->curr == se)
  			update_curr(cfs_rq);
@@ -256,7 +1223,29 @@ index d3df5b1642a6f..bb5460682ae2e 100644
  		update_load_sub(&cfs_rq->load, se->load.weight);
  	}
  	dequeue_load_avg(cfs_rq, se);
-@@ -3394,9 +3523,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+
+ 	update_load_set(&se->load, weight);
+
++	if (!se->on_rq) {
++		/*
++		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
++		 * we need to scale se->vlag when w_i changes.
++		 */
++		se->vlag = div_s64(se->vlag * old_weight, weight);
++	} else {
++		s64 deadline = se->deadline - se->vruntime;
++		/*
++		 * When the weight changes, the virtual time slope changes and
++		 * we should adjust the relative virtual deadline accordingly.
++		 */
++		deadline = div_s64(deadline * old_weight, weight);
++		se->deadline = se->vruntime + deadline;
++	}
++
+ #ifdef CONFIG_SMP
+ 	do {
+ 		u32 divider = get_pelt_divider(&se->avg);
+@@ -3394,9 +3623,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
  #endif
 
  	enqueue_load_avg(cfs_rq, se);
@@ -270,208 +1259,38 @@ index d3df5b1642a6f..bb5460682ae2e 100644
  }
 
  void reweight_task(struct task_struct *p, int prio)
-diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index 9baeb1a2dfdd4..52a0a4bde1939 100644
---- a/kernel/sched/sched.h
-+++ b/kernel/sched/sched.h
-@@ -548,6 +548,9 @@ struct cfs_rq {
- 	unsigned int		idle_nr_running;   /* SCHED_IDLE */
- 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
+@@ -4692,158 +4923,125 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 
-+	s64			avg_vruntime;
-+	u64			avg_load;
-+
- 	u64			exec_clock;
- 	u64			min_vruntime;
- #ifdef CONFIG_SCHED_CORE
-@@ -3483,4 +3486,6 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
- static inline void init_sched_mm_cid(struct task_struct *t) { }
- #endif
+ #endif /* CONFIG_SMP */
 
-+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
-+
- #endif /* _KERNEL_SCHED_SCHED_H */
---
-cgit
-
-From e0c2ff903c320d3fd3c2c604dc401b3b7c0a1d13 Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:41 +0200
-Subject: sched/fair: Remove sched_feat(START_DEBIT)
-
-With the introduction of avg_vruntime() there is no need to use worse
-approximations. Take the 0-lag point as starting point for inserting
-new tasks.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124603.722361178@infradead.org
----
- kernel/sched/fair.c     | 21 +--------------------
- kernel/sched/features.h |  6 ------
- 2 files changed, 1 insertion(+), 26 deletions(-)
-
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index bb5460682ae2e..fc43482c13e99 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -906,16 +906,6 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 	return slice;
- }
-
--/*
-- * We calculate the vruntime slice of a to-be-inserted task.
-- *
-- * vs = s/w
-- */
--static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 -{
--	return calc_delta_fair(sched_slice(cfs_rq, se), se);
+-#ifdef CONFIG_SCHED_DEBUG
+-	s64 d = se->vruntime - cfs_rq->min_vruntime;
+-
+-	if (d < 0)
+-		d = -d;
+-
+-	if (d > 3*sysctl_sched_latency)
+-		schedstat_inc(cfs_rq->nr_spread_over);
+-#endif
 -}
 -
- #include "pelt.h"
- #ifdef CONFIG_SMP
-
-@@ -4862,16 +4852,7 @@ static inline bool entity_is_long_sleeper(struct sched_entity *se)
- static void
- place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+-static inline bool entity_is_long_sleeper(struct sched_entity *se)
++static void
++place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
--	u64 vruntime = cfs_rq->min_vruntime;
--
--	/*
--	 * The 'current' period is already promised to the current tasks,
--	 * however the extra weight of the new task will slow them down a
--	 * little, place the new task so that it fits in the slot that
--	 * stays open at the end.
--	 */
--	if (initial && sched_feat(START_DEBIT))
--		vruntime += sched_vslice(cfs_rq, se);
-+	u64 vruntime = avg_vruntime(cfs_rq);
-
- 	/* sleeps up to a single latency don't count. */
- 	if (!initial) {
-diff --git a/kernel/sched/features.h b/kernel/sched/features.h
-index ee7f23c76bd33..fa828b36533df 100644
---- a/kernel/sched/features.h
-+++ b/kernel/sched/features.h
-@@ -6,12 +6,6 @@
-  */
- SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
-
--/*
-- * Place new tasks ahead so that they do not starve already running
-- * tasks
-- */
--SCHED_FEAT(START_DEBIT, true)
--
- /*
-  * Prefer to schedule the task we woke last (assuming it failed
-  * wakeup-preemption), since its likely going to consume data we
---
-cgit
-
-From 86bfbb7ce4f67a88df2639198169b685668e7349 Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:42 +0200
-Subject: sched/fair: Add lag based placement
-
-With the introduction of avg_vruntime, it is possible to approximate
-lag (the entire purpose of introducing it in fact). Use this to do lag
-based placement over sleep+wake.
-
-Specifically, the FAIR_SLEEPERS thing places things too far to the
-left and messes up the deadline aspect of EEVDF.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124603.794929315@infradead.org
----
- include/linux/sched.h   |   3 +-
- kernel/sched/core.c     |   1 +
- kernel/sched/fair.c     | 168 +++++++++++++++++++++++++++++++++++++-----------
- kernel/sched/features.h |   8 +++
- 4 files changed, 141 insertions(+), 39 deletions(-)
-
-diff --git a/include/linux/sched.h b/include/linux/sched.h
-index 2aab7be46f7e8..ba1828b2a6a50 100644
---- a/include/linux/sched.h
-+++ b/include/linux/sched.h
-@@ -554,8 +554,9 @@ struct sched_entity {
-
- 	u64				exec_start;
- 	u64				sum_exec_runtime;
--	u64				vruntime;
- 	u64				prev_sum_exec_runtime;
-+	u64				vruntime;
-+	s64				vlag;
-
- 	u64				nr_migrations;
-
-diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index 83e36547af176..84b0d47ed9b85 100644
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -4501,6 +4501,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
- 	p->se.prev_sum_exec_runtime	= 0;
- 	p->se.nr_migrations		= 0;
- 	p->se.vruntime			= 0;
-+	p->se.vlag			= 0;
- 	INIT_LIST_HEAD(&p->se.group_node);
-
- #ifdef CONFIG_FAIR_GROUP_SCHED
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index fc43482c13e99..dd12ada69b121 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -715,6 +715,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
- 	return cfs_rq->min_vruntime + avg;
- }
-
-+/*
-+ * lag_i = S - s_i = w_i * (V - v_i)
-+ */
-+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
-+{
-+	SCHED_WARN_ON(!se->on_rq);
-+	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
-+}
-+
- static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
- {
- 	u64 min_vruntime = cfs_rq->min_vruntime;
-@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
- static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- 			    unsigned long weight)
- {
-+	unsigned long old_weight = se->load.weight;
-+
- 	if (se->on_rq) {
- 		/* commit outstanding execution time */
- 		if (cfs_rq->curr == se)
-@@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-
- 	update_load_set(&se->load, weight);
-
-+	if (!se->on_rq) {
-+		/*
-+		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
-+		 * we need to scale se->vlag when w_i changes.
-+		 */
-+		se->vlag = div_s64(se->vlag * old_weight, weight);
-+	}
-+
- #ifdef CONFIG_SMP
- 	do {
- 		u32 divider = get_pelt_divider(&se->avg);
-@@ -4853,49 +4872,119 @@ static void
- place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- {
- 	u64 vruntime = avg_vruntime(cfs_rq);
+-	struct cfs_rq *cfs_rq;
+-	u64 sleep_time;
++	u64 vslice, vruntime = avg_vruntime(cfs_rq);
 +	s64 lag = 0;
 
--	/* sleeps up to a single latency don't count. */
--	if (!initial) {
--		unsigned long thresh;
+-	if (se->exec_start == 0)
+-		return false;
++	se->slice = sysctl_sched_base_slice;
++	vslice = calc_delta_fair(se->slice, se);
+
+-	cfs_rq = cfs_rq_of(se);
 +	/*
 +	 * Due to how V is constructed as the weighted average of entities,
 +	 * adding tasks with positive lag, or removing tasks with negative lag
@@ -480,19 +1299,21 @@ index fc43482c13e99..dd12ada69b121 100644
 +	 *
 +	 * EEVDF: placement strategy #1 / #2
 +	 */
-+	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
++	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
 +		struct sched_entity *curr = cfs_rq->curr;
 +		unsigned long load;
 
--		if (se_is_idle(se))
--			thresh = sysctl_sched_min_granularity;
--		else
--			thresh = sysctl_sched_latency;
+-	sleep_time = rq_clock_task(rq_of(cfs_rq));
 +		lag = se->vlag;
 
- 		/*
--		 * Halve their sleep time's effect, to allow
--		 * for a gentler effect of sleepers:
+-	/* Happen while migrating because of clock task divergence */
+-	if (sleep_time <= se->exec_start)
+-		return false;
+-
+-	sleep_time -= se->exec_start;
+-	if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
+-		return true;
++		/*
 +		 * If we want to place a task and preserve lag, we have to
 +		 * consider the effect of the new entity on the weighted
 +		 * average and compensate for this, otherwise lag can quickly
@@ -543,7 +1364,52 @@ index fc43482c13e99..dd12ada69b121 100644
 +		 *                   = W*vl_i
 +		 *
 +		 *   vl_i = (W + w_i)*vl'_i / W
- 		 */
++		 */
++		load = cfs_rq->avg_load;
++		if (curr && curr->on_rq)
++			load += scale_load_down(curr->load.weight);
+
+-	return false;
+-}
++		lag *= load + scale_load_down(se->load.weight);
++		if (WARN_ON_ONCE(!load))
++			load = 1;
++		lag = div_s64(lag, load);
++	}
+
+-static void
+-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+-{
+-	u64 vruntime = cfs_rq->min_vruntime;
++	se->vruntime = vruntime - lag;
+
+ 	/*
+-	 * The 'current' period is already promised to the current tasks,
+-	 * however the extra weight of the new task will slow them down a
+-	 * little, place the new task so that it fits in the slot that
+-	 * stays open at the end.
++	 * When joining the competition; the existing tasks will be,
++	 * on average, halfway through their slice, as such start tasks
++	 * off with half a slice to ease into the competition.
+ 	 */
+-	if (initial && sched_feat(START_DEBIT))
+-		vruntime += sched_vslice(cfs_rq, se);
+-
+-	/* sleeps up to a single latency don't count. */
+-	if (!initial) {
+-		unsigned long thresh;
++	if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
++		vslice /= 2;
+
+-		if (se_is_idle(se))
+-			thresh = sysctl_sched_min_granularity;
+-		else
+-			thresh = sysctl_sched_latency;
+-
+-		/*
+-		 * Halve their sleep time's effect, to allow
+-		 * for a gentler effect of sleepers:
+-		 */
 -		if (sched_feat(GENTLE_FAIR_SLEEPERS))
 -			thresh >>= 1;
 -
@@ -573,632 +1439,6 @@ index fc43482c13e99..dd12ada69b121 100644
 -		se->vruntime = vruntime;
 -	else
 -		se->vruntime = max_vruntime(se->vruntime, vruntime);
-+		load = cfs_rq->avg_load;
-+		if (curr && curr->on_rq)
-+			load += curr->load.weight;
-+
-+		lag *= load + se->load.weight;
-+		if (WARN_ON_ONCE(!load))
-+			load = 1;
-+		lag = div_s64(lag, load);
-+
-+		vruntime -= lag;
-+	}
-+
-+	if (sched_feat(FAIR_SLEEPERS)) {
-+
-+		/* sleeps up to a single latency don't count. */
-+		if (!initial) {
-+			unsigned long thresh;
-+
-+			if (se_is_idle(se))
-+				thresh = sysctl_sched_min_granularity;
-+			else
-+				thresh = sysctl_sched_latency;
-+
-+			/*
-+			 * Halve their sleep time's effect, to allow
-+			 * for a gentler effect of sleepers:
-+			 */
-+			if (sched_feat(GENTLE_FAIR_SLEEPERS))
-+				thresh >>= 1;
-+
-+			vruntime -= thresh;
-+		}
-+
-+		/*
-+		 * Pull vruntime of the entity being placed to the base level of
-+		 * cfs_rq, to prevent boosting it if placed backwards.  If the entity
-+		 * slept for a long time, don't even try to compare its vruntime with
-+		 * the base as it may be too far off and the comparison may get
-+		 * inversed due to s64 overflow.
-+		 */
-+		if (!entity_is_long_sleeper(se))
-+			vruntime = max_vruntime(se->vruntime, vruntime);
-+	}
-+
-+	se->vruntime = vruntime;
- }
-
- static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-@@ -5077,6 +5166,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-
- 	clear_buddies(cfs_rq, se);
-
-+	if (flags & DEQUEUE_SLEEP)
-+		update_entity_lag(cfs_rq, se);
-+
- 	if (se != cfs_rq->curr)
- 		__dequeue_entity(cfs_rq, se);
- 	se->on_rq = 0;
-diff --git a/kernel/sched/features.h b/kernel/sched/features.h
-index fa828b36533df..7958a10fe23bb 100644
---- a/kernel/sched/features.h
-+++ b/kernel/sched/features.h
-@@ -1,11 +1,19 @@
- /* SPDX-License-Identifier: GPL-2.0 */
-+
- /*
-  * Only give sleepers 50% of their service deficit. This allows
-  * them to run sooner, but does not allow tons of sleepers to
-  * rip the spread apart.
-  */
-+SCHED_FEAT(FAIR_SLEEPERS, false)
- SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
-
-+/*
-+ * Using the avg_vruntime, do the right thing and preserve lag across
-+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
-+ */
-+SCHED_FEAT(PLACE_LAG, true)
-+
- /*
-  * Prefer to schedule the task we woke last (assuming it failed
-  * wakeup-preemption), since its likely going to consume data we
---
-cgit
-
-From 99d4d26551b56f4e523dd04e4970b94aa796a64e Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:43 +0200
-Subject: rbtree: Add rb_add_augmented_cached() helper
-
-While slightly sub-optimal, updating the augmented data while going
-down the tree during lookup would be faster -- alas the augment
-interface does not currently allow for that, provide a generic helper
-to add a node to an augmented cached tree.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124603.862983648@infradead.org
----
- include/linux/rbtree_augmented.h | 26 ++++++++++++++++++++++++++
- 1 file changed, 26 insertions(+)
-
-diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
-index 7ee7ed5de7227..6dbc5a1bf6a8c 100644
---- a/include/linux/rbtree_augmented.h
-+++ b/include/linux/rbtree_augmented.h
-@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node,
- 	rb_insert_augmented(node, &root->rb_root, augment);
- }
-
-+static __always_inline struct rb_node *
-+rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
-+			bool (*less)(struct rb_node *, const struct rb_node *),
-+			const struct rb_augment_callbacks *augment)
-+{
-+	struct rb_node **link = &tree->rb_root.rb_node;
-+	struct rb_node *parent = NULL;
-+	bool leftmost = true;
-+
-+	while (*link) {
-+		parent = *link;
-+		if (less(node, parent)) {
-+			link = &parent->rb_left;
-+		} else {
-+			link = &parent->rb_right;
-+			leftmost = false;
-+		}
-+	}
-+
-+	rb_link_node(node, parent, link);
-+	augment->propagate(parent, NULL); /* suboptimal */
-+	rb_insert_augmented_cached(node, tree, leftmost, augment);
-+
-+	return leftmost ? node : NULL;
-+}
-+
- /*
-  * Template for declaring augmented rbtree callbacks (generic case)
-  *
---
-cgit
-
-From 147f3efaa24182a21706bca15eab2f3f4630b5fe Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:44 +0200
-Subject: sched/fair: Implement an EEVDF-like scheduling policy
-
-Where CFS is currently a WFQ based scheduler with only a single knob,
-the weight. The addition of a second, latency oriented parameter,
-makes something like WF2Q or EEVDF based a much better fit.
-
-Specifically, EEVDF does EDF like scheduling in the left half of the
-tree -- those entities that are owed service. Except because this is a
-virtual time scheduler, the deadlines are in virtual time as well,
-which is what allows over-subscription.
-
-EEVDF has two parameters:
-
- - weight, or time-slope: which is mapped to nice just as before
-
- - request size, or slice length: which is used to compute
-   the virtual deadline as: vd_i = ve_i + r_i/w_i
-
-Basically, by setting a smaller slice, the deadline will be earlier
-and the task will be more eligible and ran earlier.
-
-Tick driven preemption is driven by request/slice completion; while
-wakeup preemption is driven by the deadline.
-
-Because the tree is now effectively an interval tree, and the
-selection is no longer 'leftmost', over-scheduling is less of a
-problem.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124603.931005524@infradead.org
----
- include/linux/sched.h   |   4 +
- kernel/sched/core.c     |   1 +
- kernel/sched/debug.c    |   6 +-
- kernel/sched/fair.c     | 338 +++++++++++++++++++++++++++++++++++++++++-------
- kernel/sched/features.h |   3 +
- kernel/sched/sched.h    |   4 +-
- 6 files changed, 308 insertions(+), 48 deletions(-)
-
-diff --git a/include/linux/sched.h b/include/linux/sched.h
-index ba1828b2a6a50..177b3f3676ef8 100644
---- a/include/linux/sched.h
-+++ b/include/linux/sched.h
-@@ -549,6 +549,9 @@ struct sched_entity {
- 	/* For load-balancing: */
- 	struct load_weight		load;
- 	struct rb_node			run_node;
-+	u64				deadline;
-+	u64				min_deadline;
-+
- 	struct list_head		group_node;
- 	unsigned int			on_rq;
-
-@@ -557,6 +560,7 @@ struct sched_entity {
- 	u64				prev_sum_exec_runtime;
- 	u64				vruntime;
- 	s64				vlag;
-+	u64				slice;
-
- 	u64				nr_migrations;
-
-diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index 84b0d47ed9b85..e85a2fd258e2b 100644
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -4502,6 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
- 	p->se.nr_migrations		= 0;
- 	p->se.vruntime			= 0;
- 	p->se.vlag			= 0;
-+	p->se.slice			= sysctl_sched_min_granularity;
- 	INIT_LIST_HEAD(&p->se.group_node);
-
- #ifdef CONFIG_FAIR_GROUP_SCHED
-diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
-index e48d2b2db7bca..18efc6d0cc5ab 100644
---- a/kernel/sched/debug.c
-+++ b/kernel/sched/debug.c
-@@ -582,9 +582,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
- 	else
- 		SEQ_printf(m, " %c", task_state_to_char(p));
-
--	SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
-+	SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
- 		p->comm, task_pid_nr(p),
- 		SPLIT_NS(p->se.vruntime),
-+		entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
-+		SPLIT_NS(p->se.deadline),
-+		SPLIT_NS(p->se.slice),
-+		SPLIT_NS(p->se.sum_exec_runtime),
- 		(long long)(p->nvcsw + p->nivcsw),
- 		p->prio);
-
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index dd12ada69b121..4d3505dba476e 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -47,6 +47,7 @@
- #include <linux/psi.h>
- #include <linux/ratelimit.h>
- #include <linux/task_work.h>
-+#include <linux/rbtree_augmented.h>
-
- #include <asm/switch_to.h>
-
-@@ -347,6 +348,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
- 	return mul_u64_u32_shr(delta_exec, fact, shift);
- }
-
-+/*
-+ * delta /= w
-+ */
-+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-+{
-+	if (unlikely(se->load.weight != NICE_0_LOAD))
-+		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-+
-+	return delta;
-+}
-
- const struct sched_class fair_sched_class;
-
-@@ -717,11 +728,62 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
-
- /*
-  * lag_i = S - s_i = w_i * (V - v_i)
-+ *
-+ * However, since V is approximated by the weighted average of all entities it
-+ * is possible -- by addition/removal/reweight to the tree -- to move V around
-+ * and end up with a larger lag than we started with.
-+ *
-+ * Limit this to either double the slice length with a minimum of TICK_NSEC
-+ * since that is the timing granularity.
-+ *
-+ * EEVDF gives the following limit for a steady state system:
-+ *
-+ *   -r_max < lag < max(r_max, q)
-+ *
-+ * XXX could add max_slice to the augmented data to track this.
-  */
- void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
-+	s64 lag, limit;
-+
- 	SCHED_WARN_ON(!se->on_rq);
--	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
-+	lag = avg_vruntime(cfs_rq) - se->vruntime;
-+
-+	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
-+	se->vlag = clamp(lag, -limit, limit);
-+}
-+
-+/*
-+ * Entity is eligible once it received less service than it ought to have,
-+ * eg. lag >= 0.
-+ *
-+ * lag_i = S - s_i = w_i*(V - v_i)
-+ *
-+ * lag_i >= 0 -> V >= v_i
-+ *
-+ *     \Sum (v_i - v)*w_i
-+ * V = ------------------ + v
-+ *          \Sum w_i
-+ *
-+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
-+ *
-+ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
-+ *       to the loss in precision caused by the division.
-+ */
-+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
-+{
-+	struct sched_entity *curr = cfs_rq->curr;
-+	s64 avg = cfs_rq->avg_vruntime;
-+	long load = cfs_rq->avg_load;
-+
-+	if (curr && curr->on_rq) {
-+		unsigned long weight = scale_load_down(curr->load.weight);
-+
-+		avg += entity_key(cfs_rq, curr) * weight;
-+		load += weight;
-+	}
-+
-+	return avg >= entity_key(cfs_rq, se) * load;
- }
-
- static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
-@@ -740,8 +802,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
-
- static void update_min_vruntime(struct cfs_rq *cfs_rq)
- {
-+	struct sched_entity *se = __pick_first_entity(cfs_rq);
- 	struct sched_entity *curr = cfs_rq->curr;
--	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
-
- 	u64 vruntime = cfs_rq->min_vruntime;
-
-@@ -752,9 +814,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
- 			curr = NULL;
- 	}
-
--	if (leftmost) { /* non-empty tree */
--		struct sched_entity *se = __node_2_se(leftmost);
--
-+	if (se) {
- 		if (!curr)
- 			vruntime = se->vruntime;
- 		else
-@@ -771,18 +831,50 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
- 	return entity_before(__node_2_se(a), __node_2_se(b));
- }
-
-+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
-+
-+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
-+{
-+	if (node) {
-+		struct sched_entity *rse = __node_2_se(node);
-+		if (deadline_gt(min_deadline, se, rse))
-+			se->min_deadline = rse->min_deadline;
-+	}
-+}
-+
-+/*
-+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
-+ */
-+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
-+{
-+	u64 old_min_deadline = se->min_deadline;
-+	struct rb_node *node = &se->run_node;
-+
-+	se->min_deadline = se->deadline;
-+	__update_min_deadline(se, node->rb_right);
-+	__update_min_deadline(se, node->rb_left);
-+
-+	return se->min_deadline == old_min_deadline;
-+}
-+
-+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
-+		     run_node, min_deadline, min_deadline_update);
-+
- /*
-  * Enqueue an entity into the rb-tree:
-  */
- static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- 	avg_vruntime_add(cfs_rq, se);
--	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
-+	se->min_deadline = se->deadline;
-+	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
-+				__entity_less, &min_deadline_cb);
- }
-
- static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
--	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
-+	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
-+				  &min_deadline_cb);
- 	avg_vruntime_sub(cfs_rq, se);
- }
-
-@@ -806,6 +898,97 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
- 	return __node_2_se(next);
- }
-
-+static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-+{
-+	struct sched_entity *left = __pick_first_entity(cfs_rq);
-+
-+	/*
-+	 * If curr is set we have to see if its left of the leftmost entity
-+	 * still in the tree, provided there was anything in the tree at all.
-+	 */
-+	if (!left || (curr && entity_before(curr, left)))
-+		left = curr;
-+
-+	return left;
-+}
-+
-+/*
-+ * Earliest Eligible Virtual Deadline First
-+ *
-+ * In order to provide latency guarantees for different request sizes
-+ * EEVDF selects the best runnable task from two criteria:
-+ *
-+ *  1) the task must be eligible (must be owed service)
-+ *
-+ *  2) from those tasks that meet 1), we select the one
-+ *     with the earliest virtual deadline.
-+ *
-+ * We can do this in O(log n) time due to an augmented RB-tree. The
-+ * tree keeps the entries sorted on service, but also functions as a
-+ * heap based on the deadline by keeping:
-+ *
-+ *  se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
-+ *
-+ * Which allows an EDF like search on (sub)trees.
-+ */
-+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-+{
-+	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
-+	struct sched_entity *curr = cfs_rq->curr;
-+	struct sched_entity *best = NULL;
-+
-+	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
-+		curr = NULL;
-+
-+	while (node) {
-+		struct sched_entity *se = __node_2_se(node);
-+
-+		/*
-+		 * If this entity is not eligible, try the left subtree.
-+		 */
-+		if (!entity_eligible(cfs_rq, se)) {
-+			node = node->rb_left;
-+			continue;
-+		}
-+
-+		/*
-+		 * If this entity has an earlier deadline than the previous
-+		 * best, take this one. If it also has the earliest deadline
-+		 * of its subtree, we're done.
-+		 */
-+		if (!best || deadline_gt(deadline, best, se)) {
-+			best = se;
-+			if (best->deadline == best->min_deadline)
-+				break;
-+		}
-+
-+		/*
-+		 * If the earlest deadline in this subtree is in the fully
-+		 * eligible left half of our space, go there.
-+		 */
-+		if (node->rb_left &&
-+		    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
-+			node = node->rb_left;
-+			continue;
-+		}
-+
-+		node = node->rb_right;
-+	}
-+
-+	if (!best || (curr && deadline_gt(deadline, best, curr)))
-+		best = curr;
-+
-+	if (unlikely(!best)) {
-+		struct sched_entity *left = __pick_first_entity(cfs_rq);
-+		if (left) {
-+			pr_err("EEVDF scheduling fail, picking leftmost\n");
-+			return left;
-+		}
-+	}
-+
-+	return best;
-+}
-+
- #ifdef CONFIG_SCHED_DEBUG
- struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
- {
-@@ -839,17 +1022,6 @@ int sched_update_scaling(void)
- }
- #endif
-
--/*
-- * delta /= w
-- */
--static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
--{
--	if (unlikely(se->load.weight != NICE_0_LOAD))
--		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
--
--	return delta;
--}
--
- /*
-  * The idea is to set a period in which each task runs once.
-  *
-@@ -915,6 +1087,48 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 	return slice;
- }
-
-+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
-+
-+/*
-+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
-+ * this is probably good enough.
-+ */
-+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
-+{
-+	if ((s64)(se->vruntime - se->deadline) < 0)
-+		return;
-+
-+	if (sched_feat(EEVDF)) {
-+		/*
-+		 * For EEVDF the virtual time slope is determined by w_i (iow.
-+		 * nice) while the request time r_i is determined by
-+		 * sysctl_sched_min_granularity.
-+		 */
-+		se->slice = sysctl_sched_min_granularity;
-+
-+		/*
-+		 * The task has consumed its request, reschedule.
-+		 */
-+		if (cfs_rq->nr_running > 1) {
-+			resched_curr(rq_of(cfs_rq));
-+			clear_buddies(cfs_rq, se);
-+		}
-+	} else {
-+		/*
-+		 * When many tasks blow up the sched_period; it is possible
-+		 * that sched_slice() reports unusually large results (when
-+		 * many tasks are very light for example). Therefore impose a
-+		 * maximum.
-+		 */
-+		se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
-+	}
-+
-+	/*
-+	 * EEVDF: vd_i = ve_i + r_i / w_i
-+	 */
-+	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
-+}
-+
- #include "pelt.h"
- #ifdef CONFIG_SMP
-
-@@ -1047,6 +1261,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
- 	schedstat_add(cfs_rq->exec_clock, delta_exec);
-
- 	curr->vruntime += calc_delta_fair(delta_exec, curr);
-+	update_deadline(cfs_rq, curr);
- 	update_min_vruntime(cfs_rq);
-
- 	if (entity_is_task(curr)) {
-@@ -3521,6 +3736,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- 		 * we need to scale se->vlag when w_i changes.
- 		 */
- 		se->vlag = div_s64(se->vlag * old_weight, weight);
-+	} else {
-+		s64 deadline = se->deadline - se->vruntime;
-+		/*
-+		 * When the weight changes, the virtual time slope changes and
-+		 * we should adjust the relative virtual deadline accordingly.
-+		 */
-+		deadline = div_s64(deadline * old_weight, weight);
-+		se->deadline = se->vruntime + deadline;
- 	}
-
- #ifdef CONFIG_SMP
-@@ -4871,6 +5094,7 @@ static inline bool entity_is_long_sleeper(struct sched_entity *se)
- static void
- place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- {
-+	u64 vslice = calc_delta_fair(se->slice, se);
- 	u64 vruntime = avg_vruntime(cfs_rq);
- 	s64 lag = 0;
-
-@@ -4942,9 +5166,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- 		 */
- 		load = cfs_rq->avg_load;
- 		if (curr && curr->on_rq)
--			load += curr->load.weight;
-+			load += scale_load_down(curr->load.weight);
-
--		lag *= load + se->load.weight;
-+		lag *= load + scale_load_down(se->load.weight);
- 		if (WARN_ON_ONCE(!load))
- 			load = 1;
- 		lag = div_s64(lag, load);
-@@ -4985,6 +5209,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- 	}
-
- 	se->vruntime = vruntime;
-+
-+	/*
-+	 * When joining the competition; the exisiting tasks will be,
-+	 * on average, halfway through their slice, as such start tasks
-+	 * off with half a slice to ease into the competition.
-+	 */
-+	if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
-+		vslice /= 2;
-+
 +	/*
 +	 * EEVDF: vd_i = ve_i + r_i/w_i
 +	 */
@@ -1206,353 +1446,6 @@ index dd12ada69b121..4d3505dba476e 100644
  }
 
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-@@ -5207,19 +5444,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- static void
- check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
- {
--	unsigned long ideal_runtime, delta_exec;
-+	unsigned long delta_exec;
- 	struct sched_entity *se;
- 	s64 delta;
-
--	/*
--	 * When many tasks blow up the sched_period; it is possible that
--	 * sched_slice() reports unusually large results (when many tasks are
--	 * very light for example). Therefore impose a maximum.
--	 */
--	ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
--
- 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
--	if (delta_exec > ideal_runtime) {
-+	if (delta_exec > curr->slice) {
- 		resched_curr(rq_of(cfs_rq));
- 		/*
- 		 * The current task ran long enough, ensure it doesn't get
-@@ -5243,7 +5473,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
- 	if (delta < 0)
- 		return;
-
--	if (delta > ideal_runtime)
-+	if (delta > curr->slice)
- 		resched_curr(rq_of(cfs_rq));
- }
-
-@@ -5298,17 +5528,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
- static struct sched_entity *
- pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
- {
--	struct sched_entity *left = __pick_first_entity(cfs_rq);
--	struct sched_entity *se;
-+	struct sched_entity *left, *se;
-
--	/*
--	 * If curr is set we have to see if its left of the leftmost entity
--	 * still in the tree, provided there was anything in the tree at all.
--	 */
--	if (!left || (curr && entity_before(curr, left)))
--		left = curr;
-+	if (sched_feat(EEVDF)) {
-+		/*
-+		 * Enabling NEXT_BUDDY will affect latency but not fairness.
-+		 */
-+		if (sched_feat(NEXT_BUDDY) &&
-+		    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
-+			return cfs_rq->next;
-+
-+		return pick_eevdf(cfs_rq);
-+	}
-
--	se = left; /* ideally we run the leftmost entity */
-+	se = left = pick_cfs(cfs_rq, curr);
-
- 	/*
- 	 * Avoid running the skip buddy, if running something else can
-@@ -5401,7 +5634,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
- 		return;
- #endif
-
--	if (cfs_rq->nr_running > 1)
-+	if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1)
- 		check_preempt_tick(cfs_rq, curr);
- }
-
-@@ -6445,13 +6678,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
- static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
- {
- 	struct sched_entity *se = &p->se;
--	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- 	SCHED_WARN_ON(task_rq(p) != rq);
-
- 	if (rq->cfs.h_nr_running > 1) {
--		u64 slice = sched_slice(cfs_rq, se);
- 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
-+		u64 slice = se->slice;
- 		s64 delta = slice - ran;
-
- 		if (delta < 0) {
-@@ -8228,7 +8460,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
- 	if (cse_is_idle != pse_is_idle)
- 		return;
-
--	update_curr(cfs_rq_of(se));
-+	cfs_rq = cfs_rq_of(se);
-+	update_curr(cfs_rq);
-+
-+	if (sched_feat(EEVDF)) {
-+		/*
-+		 * XXX pick_eevdf(cfs_rq) != se ?
-+		 */
-+		if (pick_eevdf(cfs_rq) == pse)
-+			goto preempt;
-+
-+		return;
-+	}
-+
- 	if (wakeup_preempt_entity(se, pse) == 1) {
- 		/*
- 		 * Bias pick_next to pick the sched entity that is
-@@ -8474,7 +8718,7 @@ static void yield_task_fair(struct rq *rq)
-
- 	clear_buddies(cfs_rq, se);
-
--	if (curr->policy != SCHED_BATCH) {
-+	if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) {
- 		update_rq_clock(rq);
- 		/*
- 		 * Update run-time statistics of the 'current'.
-@@ -8487,6 +8731,8 @@ static void yield_task_fair(struct rq *rq)
- 		 */
- 		rq_clock_skip_update(rq);
- 	}
-+	if (sched_feat(EEVDF))
-+		se->deadline += calc_delta_fair(se->slice, se);
-
- 	set_skip_buddy(se);
- }
-@@ -12363,8 +12609,8 @@ static void rq_offline_fair(struct rq *rq)
- static inline bool
- __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
- {
--	u64 slice = sched_slice(cfs_rq_of(se), se);
- 	u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
-+	u64 slice = se->slice;
-
- 	return (rtime * min_nr_tasks > slice);
- }
-@@ -13059,7 +13305,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
- 	 * idle runqueue:
- 	 */
- 	if (rq->cfs.load.weight)
--		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
-+		rr_interval = NS_TO_JIFFIES(se->slice);
-
- 	return rr_interval;
- }
-diff --git a/kernel/sched/features.h b/kernel/sched/features.h
-index 7958a10fe23bb..60cce1e6f37b6 100644
---- a/kernel/sched/features.h
-+++ b/kernel/sched/features.h
-@@ -13,6 +13,7 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
-  * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
-  */
- SCHED_FEAT(PLACE_LAG, true)
-+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
-
- /*
-  * Prefer to schedule the task we woke last (assuming it failed
-@@ -103,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false)
-
- SCHED_FEAT(ALT_PERIOD, true)
- SCHED_FEAT(BASE_SLICE, true)
-+
-+SCHED_FEAT(EEVDF, true)
-diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index 52a0a4bde1939..aa5b293ca4ed3 100644
---- a/kernel/sched/sched.h
-+++ b/kernel/sched/sched.h
-@@ -2505,9 +2505,10 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
- extern const_debug unsigned int sysctl_sched_nr_migrate;
- extern const_debug unsigned int sysctl_sched_migration_cost;
-
-+extern unsigned int sysctl_sched_min_granularity;
-+
- #ifdef CONFIG_SCHED_DEBUG
- extern unsigned int sysctl_sched_latency;
--extern unsigned int sysctl_sched_min_granularity;
- extern unsigned int sysctl_sched_idle_min_granularity;
- extern unsigned int sysctl_sched_wakeup_granularity;
- extern int sysctl_resched_latency_warn_ms;
-@@ -3487,5 +3488,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
- #endif
-
- extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
-+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
-
- #endif /* _KERNEL_SCHED_SCHED_H */
---
-cgit
-
-From 76cae9dbe185b82aeb0640aa2b73da4a8e0088ce Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:45 +0200
-Subject: sched/fair: Commit to lag based placement
-
-Removes the FAIR_SLEEPERS code in favour of the new LAG based
-placement.
-
-Specifically, the whole FAIR_SLEEPER thing was a very crude
-approximation to make up for the lack of lag based placement,
-specifically the 'service owed' part. This is important for things
-like 'starve' and 'hackbench'.
-
-One side effect of FAIR_SLEEPER is that it caused 'small' unfairness,
-specifically, by always ignoring up-to 'thresh' sleeptime it would
-have a 50%/50% time distribution for a 50% sleeper vs a 100% runner,
-while strictly speaking this should (of course) result in a 33%/67%
-split (as CFS will also do if the sleep period exceeds 'thresh').
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124604.000198861@infradead.org
----
- kernel/sched/fair.c     | 59 +------------------------------------------------
- kernel/sched/features.h |  8 -------
- 2 files changed, 1 insertion(+), 66 deletions(-)
-
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 4d3505dba476e..58798dae11b60 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -5068,29 +5068,6 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
- #endif
- }
-
--static inline bool entity_is_long_sleeper(struct sched_entity *se)
--{
--	struct cfs_rq *cfs_rq;
--	u64 sleep_time;
--
--	if (se->exec_start == 0)
--		return false;
--
--	cfs_rq = cfs_rq_of(se);
--
--	sleep_time = rq_clock_task(rq_of(cfs_rq));
--
--	/* Happen while migrating because of clock task divergence */
--	if (sleep_time <= se->exec_start)
--		return false;
--
--	sleep_time -= se->exec_start;
--	if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
--		return true;
--
--	return false;
--}
--
- static void
- place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- {
-@@ -5172,43 +5149,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- 		if (WARN_ON_ONCE(!load))
- 			load = 1;
- 		lag = div_s64(lag, load);
--
--		vruntime -= lag;
--	}
--
--	if (sched_feat(FAIR_SLEEPERS)) {
--
--		/* sleeps up to a single latency don't count. */
--		if (!initial) {
--			unsigned long thresh;
--
--			if (se_is_idle(se))
--				thresh = sysctl_sched_min_granularity;
--			else
--				thresh = sysctl_sched_latency;
--
--			/*
--			 * Halve their sleep time's effect, to allow
--			 * for a gentler effect of sleepers:
--			 */
--			if (sched_feat(GENTLE_FAIR_SLEEPERS))
--				thresh >>= 1;
--
--			vruntime -= thresh;
--		}
--
--		/*
--		 * Pull vruntime of the entity being placed to the base level of
--		 * cfs_rq, to prevent boosting it if placed backwards.  If the entity
--		 * slept for a long time, don't even try to compare its vruntime with
--		 * the base as it may be too far off and the comparison may get
--		 * inversed due to s64 overflow.
--		 */
--		if (!entity_is_long_sleeper(se))
--			vruntime = max_vruntime(se->vruntime, vruntime);
- 	}
-
--	se->vruntime = vruntime;
-+	se->vruntime = vruntime - lag;
-
- 	/*
- 	 * When joining the competition; the exisiting tasks will be,
-diff --git a/kernel/sched/features.h b/kernel/sched/features.h
-index 60cce1e6f37b6..2a830eccda3e9 100644
---- a/kernel/sched/features.h
-+++ b/kernel/sched/features.h
-@@ -1,13 +1,5 @@
- /* SPDX-License-Identifier: GPL-2.0 */
-
--/*
-- * Only give sleepers 50% of their service deficit. This allows
-- * them to run sooner, but does not allow tons of sleepers to
-- * rip the spread apart.
-- */
--SCHED_FEAT(FAIR_SLEEPERS, false)
--SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
--
- /*
-  * Using the avg_vruntime, do the right thing and preserve lag across
-  * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
---
-cgit
-
-From e8f331bcc270354a803c2127c486190d33eac441 Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:46 +0200
-Subject: sched/smp: Use lag to simplify cross-runqueue placement
-
-Using lag is both more correct and simpler when moving between
-runqueues.
-
-Notable, min_vruntime() was invented as a cheap approximation of
-avg_vruntime() for this very purpose (SMP migration). Since we now
-have the real thing; use it.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124604.068911180@infradead.org
----
- kernel/sched/fair.c | 145 +++++++---------------------------------------------
- 1 file changed, 19 insertions(+), 126 deletions(-)
-
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 58798dae11b60..57e8bc14b06ee 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -5083,7 +5083,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- 	 *
- 	 * EEVDF: placement strategy #1 / #2
- 	 */
--	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
-+	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
- 		struct sched_entity *curr = cfs_rq->curr;
- 		unsigned long load;
-
-@@ -5172,60 +5172,20 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
 
  static inline bool cfs_bandwidth_used(void);
 
@@ -1599,7 +1492,7 @@ index 58798dae11b60..57e8bc14b06ee 100644
 -	if (renorm && curr)
 -		se->vruntime += cfs_rq->min_vruntime;
 +	if (curr)
-+		place_entity(cfs_rq, se, 0);
++		place_entity(cfs_rq, se, flags);
 
  	update_curr(cfs_rq);
 
@@ -1615,7 +1508,7 @@ index 58798dae11b60..57e8bc14b06ee 100644
  	/*
  	 * When enqueuing a sched_entity, we must:
  	 *   - Update loads to have both entity and cfs_rq synced with now.
-@@ -5237,11 +5197,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+@@ -4855,18 +5053,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  	 */
  	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
  	se_update_runnable(se);
@@ -1625,473 +1518,21 @@ index 58798dae11b60..57e8bc14b06ee 100644
 +	 * undo/redo all that. Seems wasteful.
 +	 */
  	update_cfs_group(se);
--	account_entity_enqueue(cfs_rq, se);
-
--	if (flags & ENQUEUE_WAKEUP)
++
 +	/*
 +	 * XXX now that the entity has been re-weighted, and it's lag adjusted,
 +	 * we can place the entity.
 +	 */
 +	if (!curr)
- 		place_entity(cfs_rq, se, 0);
-+
-+	account_entity_enqueue(cfs_rq, se);
++		place_entity(cfs_rq, se, flags);
 +
+ 	account_entity_enqueue(cfs_rq, se);
+
+-	if (flags & ENQUEUE_WAKEUP)
+-		place_entity(cfs_rq, se, 0);
  	/* Entity has migrated, no longer consider this task hot */
  	if (flags & ENQUEUE_MIGRATED)
  		se->exec_start = 0;
-@@ -5346,23 +5317,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-
- 	clear_buddies(cfs_rq, se);
-
--	if (flags & DEQUEUE_SLEEP)
--		update_entity_lag(cfs_rq, se);
--
-+	update_entity_lag(cfs_rq, se);
- 	if (se != cfs_rq->curr)
- 		__dequeue_entity(cfs_rq, se);
- 	se->on_rq = 0;
- 	account_entity_dequeue(cfs_rq, se);
-
--	/*
--	 * Normalize after update_curr(); which will also have moved
--	 * min_vruntime if @se is the one holding it back. But before doing
--	 * update_min_vruntime() again, which will discount @se's position and
--	 * can move min_vruntime forward still more.
--	 */
--	if (!(flags & DEQUEUE_SLEEP))
--		se->vruntime -= cfs_rq->min_vruntime;
--
- 	/* return excess runtime on last dequeue */
- 	return_cfs_rq_runtime(cfs_rq);
-
-@@ -8208,18 +8168,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
- {
- 	struct sched_entity *se = &p->se;
-
--	/*
--	 * As blocked tasks retain absolute vruntime the migration needs to
--	 * deal with this by subtracting the old and adding the new
--	 * min_vruntime -- the latter is done by enqueue_entity() when placing
--	 * the task on the new runqueue.
--	 */
--	if (READ_ONCE(p->__state) == TASK_WAKING) {
--		struct cfs_rq *cfs_rq = cfs_rq_of(se);
--
--		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
--	}
--
- 	if (!task_on_rq_migrating(p)) {
- 		remove_entity_load_avg(se);
-
-@@ -12709,8 +12657,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
-  */
- static void task_fork_fair(struct task_struct *p)
- {
--	struct cfs_rq *cfs_rq;
- 	struct sched_entity *se = &p->se, *curr;
-+	struct cfs_rq *cfs_rq;
- 	struct rq *rq = this_rq();
- 	struct rq_flags rf;
-
-@@ -12719,22 +12667,9 @@ static void task_fork_fair(struct task_struct *p)
-
- 	cfs_rq = task_cfs_rq(current);
- 	curr = cfs_rq->curr;
--	if (curr) {
-+	if (curr)
- 		update_curr(cfs_rq);
--		se->vruntime = curr->vruntime;
--	}
- 	place_entity(cfs_rq, se, 1);
--
--	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
--		/*
--		 * Upon rescheduling, sched_class::put_prev_task() will place
--		 * 'current' within the tree based on its new key value.
--		 */
--		swap(curr->vruntime, se->vruntime);
--		resched_curr(rq);
--	}
--
--	se->vruntime -= cfs_rq->min_vruntime;
- 	rq_unlock(rq, &rf);
- }
-
-@@ -12763,34 +12698,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
- 		check_preempt_curr(rq, p, 0);
- }
-
--static inline bool vruntime_normalized(struct task_struct *p)
--{
--	struct sched_entity *se = &p->se;
--
--	/*
--	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
--	 * the dequeue_entity(.flags=0) will already have normalized the
--	 * vruntime.
--	 */
--	if (p->on_rq)
--		return true;
--
--	/*
--	 * When !on_rq, vruntime of the task has usually NOT been normalized.
--	 * But there are some cases where it has already been normalized:
--	 *
--	 * - A forked child which is waiting for being woken up by
--	 *   wake_up_new_task().
--	 * - A task which has been woken up by try_to_wake_up() and
--	 *   waiting for actually being woken up by sched_ttwu_pending().
--	 */
--	if (!se->sum_exec_runtime ||
--	    (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
--		return true;
--
--	return false;
--}
--
- #ifdef CONFIG_FAIR_GROUP_SCHED
- /*
-  * Propagate the changes of the sched_entity across the tg tree to make it
-@@ -12861,16 +12768,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
- static void detach_task_cfs_rq(struct task_struct *p)
- {
- 	struct sched_entity *se = &p->se;
--	struct cfs_rq *cfs_rq = cfs_rq_of(se);
--
--	if (!vruntime_normalized(p)) {
--		/*
--		 * Fix up our vruntime so that the current sleep doesn't
--		 * cause 'unlimited' sleep bonus.
--		 */
--		place_entity(cfs_rq, se, 0);
--		se->vruntime -= cfs_rq->min_vruntime;
--	}
-
- 	detach_entity_cfs_rq(se);
- }
-@@ -12878,12 +12775,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
- static void attach_task_cfs_rq(struct task_struct *p)
- {
- 	struct sched_entity *se = &p->se;
--	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- 	attach_entity_cfs_rq(se);
--
--	if (!vruntime_normalized(p))
--		se->vruntime += cfs_rq->min_vruntime;
- }
-
- static void switched_from_fair(struct rq *rq, struct task_struct *p)
---
-cgit
-
-From 5e963f2bd4654a202a8a05aa3a86cb0300b10e6c Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:47 +0200
-Subject: sched/fair: Commit to EEVDF
-
-EEVDF is a better defined scheduling policy, as a result it has less
-heuristics/tunables. There is no compelling reason to keep CFS around.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124604.137187212@infradead.org
----
- kernel/sched/debug.c    |   6 -
- kernel/sched/fair.c     | 465 ++++--------------------------------------------
- kernel/sched/features.h |  12 --
- kernel/sched/sched.h    |   5 -
- 4 files changed, 38 insertions(+), 450 deletions(-)
-
-diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
-index 18efc6d0cc5ab..f8d190c7c8c0d 100644
---- a/kernel/sched/debug.c
-+++ b/kernel/sched/debug.c
-@@ -347,10 +347,7 @@ static __init int sched_init_debug(void)
- 	debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
- #endif
-
--	debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
- 	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
--	debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
--	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
-
- 	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
- 	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
-@@ -866,10 +863,7 @@ static void sched_debug_header(struct seq_file *m)
- 	SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
- #define PN(x) \
- 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
--	PN(sysctl_sched_latency);
- 	PN(sysctl_sched_min_granularity);
--	PN(sysctl_sched_idle_min_granularity);
--	PN(sysctl_sched_wakeup_granularity);
- 	P(sysctl_sched_child_runs_first);
- 	P(sysctl_sched_features);
- #undef PN
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 57e8bc14b06ee..0605eb45c58aa 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -57,22 +57,6 @@
- #include "stats.h"
- #include "autogroup.h"
-
--/*
-- * Targeted preemption latency for CPU-bound tasks:
-- *
-- * NOTE: this latency value is not the same as the concept of
-- * 'timeslice length' - timeslices in CFS are of variable length
-- * and have no persistent notion like in traditional, time-slice
-- * based scheduling concepts.
-- *
-- * (to see the precise effective timeslice length of your workload,
-- *  run vmstat and monitor the context-switches (cs) field)
-- *
-- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
-- */
--unsigned int sysctl_sched_latency			= 6000000ULL;
--static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;
--
- /*
-  * The initial- and re-scaling of tunables is configurable
-  *
-@@ -94,37 +78,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
- unsigned int sysctl_sched_min_granularity			= 750000ULL;
- static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
-
--/*
-- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
-- * Applies only when SCHED_IDLE tasks compete with normal tasks.
-- *
-- * (default: 0.75 msec)
-- */
--unsigned int sysctl_sched_idle_min_granularity			= 750000ULL;
--
--/*
-- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
-- */
--static unsigned int sched_nr_latency = 8;
--
- /*
-  * After fork, child runs first. If set to 0 (default) then
-  * parent will (try to) run first.
-  */
- unsigned int sysctl_sched_child_runs_first __read_mostly;
-
--/*
-- * SCHED_OTHER wake-up granularity.
-- *
-- * This option delays the preemption effects of decoupled workloads
-- * and reduces their over-scheduling. Synchronous workloads will still
-- * have immediate wakeup/sleep latencies.
-- *
-- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
-- */
--unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
--static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
--
- const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
-
- int sched_thermal_decay_shift;
-@@ -279,8 +238,6 @@ static void update_sysctl(void)
- #define SET_SYSCTL(name) \
- 	(sysctl_##name = (factor) * normalized_sysctl_##name)
- 	SET_SYSCTL(sched_min_granularity);
--	SET_SYSCTL(sched_latency);
--	SET_SYSCTL(sched_wakeup_granularity);
- #undef SET_SYSCTL
- }
-
-@@ -888,30 +845,6 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
- 	return __node_2_se(left);
- }
-
--static struct sched_entity *__pick_next_entity(struct sched_entity *se)
--{
--	struct rb_node *next = rb_next(&se->run_node);
--
--	if (!next)
--		return NULL;
--
--	return __node_2_se(next);
--}
--
--static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr)
--{
--	struct sched_entity *left = __pick_first_entity(cfs_rq);
--
--	/*
--	 * If curr is set we have to see if its left of the leftmost entity
--	 * still in the tree, provided there was anything in the tree at all.
--	 */
--	if (!left || (curr && entity_before(curr, left)))
--		left = curr;
--
--	return left;
--}
--
- /*
-  * Earliest Eligible Virtual Deadline First
-  *
-@@ -1008,85 +941,15 @@ int sched_update_scaling(void)
- {
- 	unsigned int factor = get_update_sysctl_factor();
-
--	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
--					sysctl_sched_min_granularity);
--
- #define WRT_SYSCTL(name) \
- 	(normalized_sysctl_##name = sysctl_##name / (factor))
- 	WRT_SYSCTL(sched_min_granularity);
--	WRT_SYSCTL(sched_latency);
--	WRT_SYSCTL(sched_wakeup_granularity);
- #undef WRT_SYSCTL
-
- 	return 0;
- }
- #endif
-
--/*
-- * The idea is to set a period in which each task runs once.
-- *
-- * When there are too many tasks (sched_nr_latency) we have to stretch
-- * this period because otherwise the slices get too small.
-- *
-- * p = (nr <= nl) ? l : l*nr/nl
-- */
--static u64 __sched_period(unsigned long nr_running)
--{
--	if (unlikely(nr_running > sched_nr_latency))
--		return nr_running * sysctl_sched_min_granularity;
--	else
--		return sysctl_sched_latency;
--}
--
--static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
--
--/*
-- * We calculate the wall-time slice from the period by taking a part
-- * proportional to the weight.
-- *
-- * s = p*P[w/rw]
-- */
--static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
--{
--	unsigned int nr_running = cfs_rq->nr_running;
--	struct sched_entity *init_se = se;
--	unsigned int min_gran;
--	u64 slice;
--
--	if (sched_feat(ALT_PERIOD))
--		nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
--
--	slice = __sched_period(nr_running + !se->on_rq);
--
--	for_each_sched_entity(se) {
--		struct load_weight *load;
--		struct load_weight lw;
--		struct cfs_rq *qcfs_rq;
--
--		qcfs_rq = cfs_rq_of(se);
--		load = &qcfs_rq->load;
--
--		if (unlikely(!se->on_rq)) {
--			lw = qcfs_rq->load;
--
--			update_load_add(&lw, se->load.weight);
--			load = &lw;
--		}
--		slice = __calc_delta(slice, se->load.weight, load);
--	}
--
--	if (sched_feat(BASE_SLICE)) {
--		if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
--			min_gran = sysctl_sched_idle_min_granularity;
--		else
--			min_gran = sysctl_sched_min_granularity;
--
--		slice = max_t(u64, slice, min_gran);
--	}
--
--	return slice;
--}
--
- static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
-
- /*
-@@ -1098,35 +961,25 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 	if ((s64)(se->vruntime - se->deadline) < 0)
- 		return;
-
--	if (sched_feat(EEVDF)) {
--		/*
--		 * For EEVDF the virtual time slope is determined by w_i (iow.
--		 * nice) while the request time r_i is determined by
--		 * sysctl_sched_min_granularity.
--		 */
--		se->slice = sysctl_sched_min_granularity;
--
--		/*
--		 * The task has consumed its request, reschedule.
--		 */
--		if (cfs_rq->nr_running > 1) {
--			resched_curr(rq_of(cfs_rq));
--			clear_buddies(cfs_rq, se);
--		}
--	} else {
--		/*
--		 * When many tasks blow up the sched_period; it is possible
--		 * that sched_slice() reports unusually large results (when
--		 * many tasks are very light for example). Therefore impose a
--		 * maximum.
--		 */
--		se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
--	}
-+	/*
-+	 * For EEVDF the virtual time slope is determined by w_i (iow.
-+	 * nice) while the request time r_i is determined by
-+	 * sysctl_sched_min_granularity.
-+	 */
-+	se->slice = sysctl_sched_min_granularity;
-
- 	/*
- 	 * EEVDF: vd_i = ve_i + r_i / w_i
- 	 */
- 	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
-+
-+	/*
-+	 * The task has consumed its request, reschedule.
-+	 */
-+	if (cfs_rq->nr_running > 1) {
-+		resched_curr(rq_of(cfs_rq));
-+		clear_buddies(cfs_rq, se);
-+	}
- }
-
- #include "pelt.h"
-@@ -5055,19 +4908,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
-
- #endif /* CONFIG_SMP */
-
--static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
--{
--#ifdef CONFIG_SCHED_DEBUG
--	s64 d = se->vruntime - cfs_rq->min_vruntime;
--
--	if (d < 0)
--		d = -d;
--
--	if (d > 3*sysctl_sched_latency)
--		schedstat_inc(cfs_rq->nr_spread_over);
--#endif
--}
--
- static void
- place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- {
-@@ -5219,7 +5059,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
  	check_schedstat_required();
  	update_stats_enqueue_fair(cfs_rq, se, flags);
@@ -2099,7 +1540,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  	if (!curr)
  		__enqueue_entity(cfs_rq, se);
  	se->on_rq = 1;
-@@ -5241,17 +5080,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+@@ -4878,17 +5086,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  	}
  }
 
@@ -2117,7 +1558,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  static void __clear_buddies_next(struct sched_entity *se)
  {
  	for_each_sched_entity(se) {
-@@ -5263,27 +5091,10 @@ static void __clear_buddies_next(struct sched_entity *se)
+@@ -4900,27 +5097,10 @@ static void __clear_buddies_next(struct sched_entity *se)
  	}
  }
 
@@ -2145,7 +1586,29 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  }
 
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-@@ -5341,45 +5152,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+@@ -4954,20 +5134,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+
+ 	clear_buddies(cfs_rq, se);
+
++	update_entity_lag(cfs_rq, se);
+ 	if (se != cfs_rq->curr)
+ 		__dequeue_entity(cfs_rq, se);
+ 	se->on_rq = 0;
+ 	account_entity_dequeue(cfs_rq, se);
+
+-	/*
+-	 * Normalize after update_curr(); which will also have moved
+-	 * min_vruntime if @se is the one holding it back. But before doing
+-	 * update_min_vruntime() again, which will discount @se's position and
+-	 * can move min_vruntime forward still more.
+-	 */
+-	if (!(flags & DEQUEUE_SLEEP))
+-		se->vruntime -= cfs_rq->min_vruntime;
+-
+ 	/* return excess runtime on last dequeue */
+ 	return_cfs_rq_runtime(cfs_rq);
+
+@@ -4986,52 +5158,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  		update_idle_cfs_rq_clock_pelt(cfs_rq);
  }
 
@@ -2155,12 +1618,19 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
 -static void
 -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 -{
--	unsigned long delta_exec;
+-	unsigned long ideal_runtime, delta_exec;
 -	struct sched_entity *se;
 -	s64 delta;
 -
+-	/*
+-	 * When many tasks blow up the sched_period; it is possible that
+-	 * sched_slice() reports unusually large results (when many tasks are
+-	 * very light for example). Therefore impose a maximum.
+-	 */
+-	ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
+-
 -	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
--	if (delta_exec > curr->slice) {
+-	if (delta_exec > ideal_runtime) {
 -		resched_curr(rq_of(cfs_rq));
 -		/*
 -		 * The current task ran long enough, ensure it doesn't get
@@ -2184,14 +1654,26 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
 -	if (delta < 0)
 -		return;
 -
--	if (delta > curr->slice)
+-	if (delta > ideal_runtime)
 -		resched_curr(rq_of(cfs_rq));
 -}
 -
  static void
  set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-@@ -5418,9 +5190,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+@@ -5047,6 +5173,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ 		update_stats_wait_end_fair(cfs_rq, se);
+ 		__dequeue_entity(cfs_rq, se);
+ 		update_load_avg(cfs_rq, se, UPDATE_TG);
++		/*
++		 * HACK, stash a copy of deadline at the point of pick in vlag,
++		 * which isn't used until dequeue.
++		 */
++		se->vlag = se->deadline;
+ 	}
+
+ 	update_stats_curr_start(cfs_rq, se);
+@@ -5070,9 +5201,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  	se->prev_sum_exec_runtime = se->sum_exec_runtime;
  }
 
@@ -2201,24 +1683,21 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  /*
   * Pick the next process, keeping these things in mind, in this order:
   * 1) keep things fair between processes/task groups
-@@ -5431,53 +5200,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+@@ -5083,50 +5211,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  static struct sched_entity *
  pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
  {
--	struct sched_entity *left, *se;
+-	struct sched_entity *left = __pick_first_entity(cfs_rq);
+-	struct sched_entity *se;
 -
--	if (sched_feat(EEVDF)) {
--		/*
--		 * Enabling NEXT_BUDDY will affect latency but not fairness.
--		 */
--		if (sched_feat(NEXT_BUDDY) &&
--		    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
--			return cfs_rq->next;
+-	/*
+-	 * If curr is set we have to see if its left of the leftmost entity
+-	 * still in the tree, provided there was anything in the tree at all.
+-	 */
+-	if (!left || (curr && entity_before(curr, left)))
+-		left = curr;
 -
--		return pick_eevdf(cfs_rq);
--	}
--
--	se = left = pick_cfs(cfs_rq, curr);
+-	se = left; /* ideally we run the leftmost entity */
 -
  	/*
 -	 * Avoid running the skip buddy, if running something else can
@@ -2227,10 +1706,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  	 */
 -	if (cfs_rq->skip && cfs_rq->skip == se) {
 -		struct sched_entity *second;
-+	if (sched_feat(NEXT_BUDDY) &&
-+	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
-+		return cfs_rq->next;
-
+-
 -		if (se == curr) {
 -			second = __pick_first_entity(cfs_rq);
 -		} else {
@@ -2242,7 +1718,10 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
 -		if (second && wakeup_preempt_entity(second, left) < 1)
 -			se = second;
 -	}
--
++	if (sched_feat(NEXT_BUDDY) &&
++	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
++		return cfs_rq->next;
+
 -	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
 -		/*
 -		 * Someone really wants this to run. If it's not unfair, run it.
@@ -2260,7 +1739,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  }
 
  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-@@ -5494,8 +5224,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+@@ -5143,8 +5235,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
  	/* throttle cfs_rqs exceeding runtime */
  	check_cfs_rq_runtime(cfs_rq);
 
@@ -2269,17 +1748,32 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  	if (prev->on_rq) {
  		update_stats_wait_start_fair(cfs_rq, prev);
  		/* Put 'current' back into the tree. */
-@@ -5536,9 +5264,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
+@@ -5185,9 +5275,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
  			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
  		return;
  #endif
 -
--	if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1)
+-	if (cfs_rq->nr_running > 1)
 -		check_preempt_tick(cfs_rq, curr);
  }
 
 
-@@ -6610,8 +6335,7 @@ static void hrtick_update(struct rq *rq)
+@@ -6210,13 +6297,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+ {
+ 	struct sched_entity *se = &p->se;
+-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ 	SCHED_WARN_ON(task_rq(p) != rq);
+
+ 	if (rq->cfs.h_nr_running > 1) {
+-		u64 slice = sched_slice(cfs_rq, se);
+ 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
++		u64 slice = se->slice;
+ 		s64 delta = slice - ran;
+
+ 		if (delta < 0) {
+@@ -6240,8 +6326,7 @@ static void hrtick_update(struct rq *rq)
  	if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
  		return;
 
@@ -2289,7 +1783,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  }
  #else /* !CONFIG_SCHED_HRTICK */
  static inline void
-@@ -6652,17 +6376,6 @@ static int sched_idle_rq(struct rq *rq)
+@@ -6282,17 +6367,6 @@ static int sched_idle_rq(struct rq *rq)
  			rq->nr_running);
  }
 
@@ -2307,7 +1801,26 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  #ifdef CONFIG_SMP
  static int sched_idle_cpu(int cpu)
  {
-@@ -8205,66 +7918,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+@@ -7795,18 +7869,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
+ {
+ 	struct sched_entity *se = &p->se;
+
+-	/*
+-	 * As blocked tasks retain absolute vruntime the migration needs to
+-	 * deal with this by subtracting the old and adding the new
+-	 * min_vruntime -- the latter is done by enqueue_entity() when placing
+-	 * the task on the new runqueue.
+-	 */
+-	if (READ_ONCE(p->__state) == TASK_WAKING) {
+-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+-
+-		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
+-	}
+-
+ 	if (!task_on_rq_migrating(p)) {
+ 		remove_entity_load_avg(se);
+
+@@ -7844,66 +7906,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  }
  #endif /* CONFIG_SMP */
 
@@ -2374,7 +1887,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  static void set_next_buddy(struct sched_entity *se)
  {
  	for_each_sched_entity(se) {
-@@ -8276,12 +7929,6 @@ static void set_next_buddy(struct sched_entity *se)
+@@ -7915,12 +7917,6 @@ static void set_next_buddy(struct sched_entity *se)
  	}
  }
 
@@ -2387,7 +1900,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  /*
   * Preempt the current task with a newly woken task if needed:
   */
-@@ -8290,7 +7937,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
+@@ -7929,7 +7925,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
  	struct task_struct *curr = rq->curr;
  	struct sched_entity *se = &curr->se, *pse = &p->se;
  	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
@@ -2395,7 +1908,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  	int next_buddy_marked = 0;
  	int cse_is_idle, pse_is_idle;
 
-@@ -8306,7 +7952,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
+@@ -7945,7 +7940,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
  	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
  		return;
 
@@ -2404,20 +1917,11 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  		set_next_buddy(pse);
  		next_buddy_marked = 1;
  	}
-@@ -8354,44 +8000,16 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
- 	cfs_rq = cfs_rq_of(se);
- 	update_curr(cfs_rq);
+@@ -7990,35 +7985,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
+ 	if (cse_is_idle != pse_is_idle)
+ 		return;
 
--	if (sched_feat(EEVDF)) {
--		/*
--		 * XXX pick_eevdf(cfs_rq) != se ?
--		 */
--		if (pick_eevdf(cfs_rq) == pse)
--			goto preempt;
--
--		return;
--	}
--
+-	update_curr(cfs_rq_of(se));
 -	if (wakeup_preempt_entity(se, pse) == 1) {
 -		/*
 -		 * Bias pick_next to pick the sched entity that is
@@ -2425,6 +1929,9 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
 -		 */
 -		if (!next_buddy_marked)
 -			set_next_buddy(pse);
++	cfs_rq = cfs_rq_of(se);
++	update_curr(cfs_rq);
++
 +	/*
 +	 * XXX pick_eevdf(cfs_rq) != se ?
 +	 */
@@ -2453,7 +1960,22 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  }
 
  #ifdef CONFIG_SMP
-@@ -8592,8 +8210,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+@@ -8203,6 +8182,14 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
+ 	return pick_next_task_fair(rq, NULL, NULL);
+ }
+
++static bool eligible_task_fair(struct rq *rq, struct task_struct *p)
++{
++	struct sched_entity *se = &p->se;
++	struct cfs_rq *cfs_rq = cfs_rq_of(se);
++
++	return entity_eligible(cfs_rq, se);
++}
++
+ /*
+  * Account for a descheduled task:
+  */
+@@ -8219,8 +8206,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 
  /*
   * sched_yield() is very simple
@@ -2462,11 +1984,11 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
   */
  static void yield_task_fair(struct rq *rq)
  {
-@@ -8609,23 +8225,19 @@ static void yield_task_fair(struct rq *rq)
+@@ -8236,21 +8221,19 @@ static void yield_task_fair(struct rq *rq)
 
  	clear_buddies(cfs_rq, se);
 
--	if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) {
+-	if (curr->policy != SCHED_BATCH) {
 -		update_rq_clock(rq);
 -		/*
 -		 * Update run-time statistics of the 'current'.
@@ -2479,8 +2001,6 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
 -		 */
 -		rq_clock_skip_update(rq);
 -	}
--	if (sched_feat(EEVDF))
--		se->deadline += calc_delta_fair(se->slice, se);
 +	update_rq_clock(rq);
 +	/*
 +	 * Update run-time statistics of the 'current'.
@@ -2498,7 +2018,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  }
 
  static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
-@@ -8873,8 +8485,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
+@@ -8493,8 +8476,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
  	 * Buddy candidates are cache hot:
  	 */
  	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
@@ -2508,858 +2028,117 @@ index 57e8bc14b06ee..0605eb45c58aa 100644
  		return 1;
 
  	if (sysctl_sched_migration_cost == -1)
-diff --git a/kernel/sched/features.h b/kernel/sched/features.h
-index 2a830eccda3e9..54334ca5c5c61 100644
---- a/kernel/sched/features.h
-+++ b/kernel/sched/features.h
-@@ -14,13 +14,6 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
-  */
- SCHED_FEAT(NEXT_BUDDY, false)
-
--/*
-- * Prefer to schedule the task that ran last (when we did
-- * wake-preempt) as that likely will touch the same data, increases
-- * cache locality.
-- */
--SCHED_FEAT(LAST_BUDDY, true)
--
- /*
-  * Consider buddies to be cache hot, decreases the likeliness of a
-  * cache buddy being migrated away, increases cache locality.
-@@ -93,8 +86,3 @@ SCHED_FEAT(UTIL_EST, true)
- SCHED_FEAT(UTIL_EST_FASTUP, true)
-
- SCHED_FEAT(LATENCY_WARN, false)
--
--SCHED_FEAT(ALT_PERIOD, true)
--SCHED_FEAT(BASE_SLICE, true)
--
--SCHED_FEAT(EEVDF, true)
-diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index aa5b293ca4ed3..f814bb731235d 100644
---- a/kernel/sched/sched.h
-+++ b/kernel/sched/sched.h
-@@ -570,8 +570,6 @@ struct cfs_rq {
- 	 */
- 	struct sched_entity	*curr;
- 	struct sched_entity	*next;
--	struct sched_entity	*last;
--	struct sched_entity	*skip;
-
- #ifdef	CONFIG_SCHED_DEBUG
- 	unsigned int		nr_spread_over;
-@@ -2508,9 +2506,6 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
- extern unsigned int sysctl_sched_min_granularity;
-
- #ifdef CONFIG_SCHED_DEBUG
--extern unsigned int sysctl_sched_latency;
--extern unsigned int sysctl_sched_idle_min_granularity;
--extern unsigned int sysctl_sched_wakeup_granularity;
- extern int sysctl_resched_latency_warn_ms;
- extern int sysctl_resched_latency_warn_once;
-
---
-cgit
-
-From e4ec3318a17f5dcf11bc23b2d2c1da4c1c5bb507 Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:48 +0200
-Subject: sched/debug: Rename sysctl_sched_min_granularity to
- sysctl_sched_base_slice
-
-EEVDF uses this tunable as the base request/slice -- make sure the
-name reflects this.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124604.205287511@infradead.org
----
- kernel/sched/core.c  |  2 +-
- kernel/sched/debug.c |  4 ++--
- kernel/sched/fair.c  | 12 ++++++------
- kernel/sched/sched.h |  2 +-
- 4 files changed, 10 insertions(+), 10 deletions(-)
-
-diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index e85a2fd258e2b..a5d3422f7d0de 100644
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -4502,7 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
- 	p->se.nr_migrations		= 0;
- 	p->se.vruntime			= 0;
- 	p->se.vlag			= 0;
--	p->se.slice			= sysctl_sched_min_granularity;
-+	p->se.slice			= sysctl_sched_base_slice;
- 	INIT_LIST_HEAD(&p->se.group_node);
-
- #ifdef CONFIG_FAIR_GROUP_SCHED
-diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
-index f8d190c7c8c0d..4c3d0d9f3db63 100644
---- a/kernel/sched/debug.c
-+++ b/kernel/sched/debug.c
-@@ -347,7 +347,7 @@ static __init int sched_init_debug(void)
- 	debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
- #endif
-
--	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
-+	debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
-
- 	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
- 	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
-@@ -863,7 +863,7 @@ static void sched_debug_header(struct seq_file *m)
- 	SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
- #define PN(x) \
- 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
--	PN(sysctl_sched_min_granularity);
-+	PN(sysctl_sched_base_slice);
- 	P(sysctl_sched_child_runs_first);
- 	P(sysctl_sched_features);
- #undef PN
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 0605eb45c58aa..61747a25d06db 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -75,8 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
-  *
-  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
-  */
--unsigned int sysctl_sched_min_granularity			= 750000ULL;
--static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
-+unsigned int sysctl_sched_base_slice			= 750000ULL;
-+static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;
-
- /*
-  * After fork, child runs first. If set to 0 (default) then
-@@ -237,7 +237,7 @@ static void update_sysctl(void)
-
- #define SET_SYSCTL(name) \
- 	(sysctl_##name = (factor) * normalized_sysctl_##name)
--	SET_SYSCTL(sched_min_granularity);
-+	SET_SYSCTL(sched_base_slice);
- #undef SET_SYSCTL
- }
-
-@@ -943,7 +943,7 @@ int sched_update_scaling(void)
-
- #define WRT_SYSCTL(name) \
- 	(normalized_sysctl_##name = sysctl_##name / (factor))
--	WRT_SYSCTL(sched_min_granularity);
-+	WRT_SYSCTL(sched_base_slice);
- #undef WRT_SYSCTL
-
- 	return 0;
-@@ -964,9 +964,9 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 	/*
- 	 * For EEVDF the virtual time slope is determined by w_i (iow.
- 	 * nice) while the request time r_i is determined by
--	 * sysctl_sched_min_granularity.
-+	 * sysctl_sched_base_slice.
- 	 */
--	se->slice = sysctl_sched_min_granularity;
-+	se->slice = sysctl_sched_base_slice;
-
- 	/*
- 	 * EEVDF: vd_i = ve_i + r_i / w_i
-diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index f814bb731235d..7ff9965570e69 100644
---- a/kernel/sched/sched.h
-+++ b/kernel/sched/sched.h
-@@ -2503,7 +2503,7 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
- extern const_debug unsigned int sysctl_sched_nr_migrate;
- extern const_debug unsigned int sysctl_sched_migration_cost;
-
--extern unsigned int sysctl_sched_min_granularity;
-+extern unsigned int sysctl_sched_base_slice;
-
- #ifdef CONFIG_SCHED_DEBUG
- extern int sysctl_resched_latency_warn_ms;
---
-cgit
-
-From d07f09a1f99cabbc86bc5c97d962eb8a466106b5 Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 31 May 2023 13:58:49 +0200
-Subject: sched/fair: Propagate enqueue flags into place_entity()
-
-This allows place_entity() to consider ENQUEUE_WAKEUP and
-ENQUEUE_MIGRATED.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20230531124604.274010996@infradead.org
----
- kernel/sched/fair.c  | 10 +++++-----
- kernel/sched/sched.h |  1 +
- 2 files changed, 6 insertions(+), 5 deletions(-)
-
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 61747a25d06db..5c8c9f7d8496a 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -4909,7 +4909,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
- #endif /* CONFIG_SMP */
-
- static void
--place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+@@ -12004,8 +11986,8 @@ static void rq_offline_fair(struct rq *rq)
+ static inline bool
+ __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
  {
- 	u64 vslice = calc_delta_fair(se->slice, se);
- 	u64 vruntime = avg_vruntime(cfs_rq);
-@@ -4998,7 +4998,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
- 	 * on average, halfway through their slice, as such start tasks
- 	 * off with half a slice to ease into the competition.
- 	 */
--	if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
-+	if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
- 		vslice /= 2;
+-	u64 slice = sched_slice(cfs_rq_of(se), se);
+ 	u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
++	u64 slice = se->slice;
 
- 	/*
-@@ -5022,7 +5022,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- 	 * update_curr().
- 	 */
- 	if (curr)
--		place_entity(cfs_rq, se, 0);
-+		place_entity(cfs_rq, se, flags);
+ 	return (rtime * min_nr_tasks > slice);
+ }
+@@ -12161,8 +12143,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
+  */
+ static void task_fork_fair(struct task_struct *p)
+ {
+-	struct cfs_rq *cfs_rq;
+ 	struct sched_entity *se = &p->se, *curr;
++	struct cfs_rq *cfs_rq;
+ 	struct rq *rq = this_rq();
+ 	struct rq_flags rf;
 
- 	update_curr(cfs_rq);
+@@ -12171,22 +12153,9 @@ static void task_fork_fair(struct task_struct *p)
 
-@@ -5049,7 +5049,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- 	 * we can place the entity.
- 	 */
- 	if (!curr)
--		place_entity(cfs_rq, se, 0);
-+		place_entity(cfs_rq, se, flags);
-
- 	account_entity_enqueue(cfs_rq, se);
-
-@@ -12280,7 +12280,7 @@ static void task_fork_fair(struct task_struct *p)
+ 	cfs_rq = task_cfs_rq(current);
  	curr = cfs_rq->curr;
- 	if (curr)
+-	if (curr) {
++	if (curr)
  		update_curr(cfs_rq);
+-		se->vruntime = curr->vruntime;
+-	}
 -	place_entity(cfs_rq, se, 1);
+-
+-	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
+-		/*
+-		 * Upon rescheduling, sched_class::put_prev_task() will place
+-		 * 'current' within the tree based on its new key value.
+-		 */
+-		swap(curr->vruntime, se->vruntime);
+-		resched_curr(rq);
+-	}
+-
+-	se->vruntime -= cfs_rq->min_vruntime;
 +	place_entity(cfs_rq, se, ENQUEUE_INITIAL);
  	rq_unlock(rq, &rf);
  }
 
-diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index 7ff9965570e69..db5853761b1f3 100644
---- a/kernel/sched/sched.h
-+++ b/kernel/sched/sched.h
-@@ -2199,6 +2199,7 @@ extern const u32		sched_prio_to_wmult[40];
- #else
- #define ENQUEUE_MIGRATED	0x00
- #endif
-+#define ENQUEUE_INITIAL		0x80
-
- #define RETRY_TASK		((void *)-1UL)
-
---
-cgit
-
-From 246c6d7ab4d042b185d7df71f437137d43cbb83a Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Sat, 25 Mar 2023 00:14:04 +0100
-Subject: sched/eevdf: Better handle mixed slice length
-
-In the case where (due to latency-nice) there are different request
-sizes in the tree, the smaller requests tend to be dominated by the
-larger. Also note how the EEVDF lag limits are based on r_max.
-
-Therefore; add a heuristic that for the mixed request size case, moves
-smaller requests to placement strategy #2 which ensures they're
-immidiately eligible and and due to their smaller (virtual) deadline
-will cause preemption.
-
-NOTE: this relies on update_entity_lag() to impose lag limits above
-a single slice.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
----
- kernel/sched/fair.c     | 39 +++++++++++++++++++++++++++++++++++++++
- kernel/sched/features.h |  1 +
- kernel/sched/sched.h    |  1 +
- 3 files changed, 41 insertions(+)
-
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 5c8c9f7d8496a..16949f7bbb172 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 	s64 key = entity_key(cfs_rq, se);
-
- 	cfs_rq->avg_vruntime += key * weight;
-+	cfs_rq->avg_slice += se->slice * weight;
- 	cfs_rq->avg_load += weight;
+@@ -12215,34 +12184,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+ 		check_preempt_curr(rq, p, 0);
  }
 
-@@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 	s64 key = entity_key(cfs_rq, se);
-
- 	cfs_rq->avg_vruntime -= key * weight;
-+	cfs_rq->avg_slice -= se->slice * weight;
- 	cfs_rq->avg_load -= weight;
- }
-
-@@ -4908,6 +4910,30 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
-
- #endif /* CONFIG_SMP */
-
-+static inline bool
-+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags)
-+{
-+	u64 now, vdelta;
-+	s64 delta;
-+
-+	if (!(flags & ENQUEUE_WAKEUP))
-+		return false;
-+
-+	if (flags & ENQUEUE_MIGRATED)
-+		return true;
-+
-+	now = rq_clock_task(rq_of(cfs_rq));
-+	delta = now - se->exec_start;
-+	if (delta < 0)
-+		return false;
-+
-+	vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
-+	if (vdelta < vslice)
-+		return false;
-+
-+	return true;
-+}
-+
- static void
- place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
-@@ -4929,6 +4955,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-
- 		lag = se->vlag;
-
-+		/*
-+		 * For latency sensitive tasks; those that have a shorter than
-+		 * average slice and do not fully consume the slice, transition
-+		 * to EEVDF placement strategy #2.
-+		 */
-+		if (sched_feat(PLACE_FUDGE) &&
-+		    (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
-+		    entity_has_slept(cfs_rq, se, vslice, flags)) {
-+			lag += vslice;
-+			if (lag > 0)
-+				lag = 0;
-+		}
-+
- 		/*
- 		 * If we want to place a task and preserve lag, we have to
- 		 * consider the effect of the new entity on the weighted
-diff --git a/kernel/sched/features.h b/kernel/sched/features.h
-index 54334ca5c5c61..7d65b40299d91 100644
---- a/kernel/sched/features.h
-+++ b/kernel/sched/features.h
-@@ -5,6 +5,7 @@
-  * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
-  */
- SCHED_FEAT(PLACE_LAG, true)
-+SCHED_FEAT(PLACE_FUDGE, true)
- SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
-
- /*
-diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index db5853761b1f3..bc45beee335c5 100644
---- a/kernel/sched/sched.h
-+++ b/kernel/sched/sched.h
-@@ -549,6 +549,7 @@ struct cfs_rq {
- 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
-
- 	s64			avg_vruntime;
-+	u64			avg_slice;
- 	u64			avg_load;
-
- 	u64			exec_clock;
---
-cgit
-
-From 36b9081885fee5764b53970dd2d6afe8c2f13b7f Mon Sep 17 00:00:00 2001
-From: Parth Shah <parth@linux.ibm.com>
-Date: Sat, 11 Mar 2023 12:20:21 +0100
-Subject: sched: Introduce latency-nice as a per-task attribute
-
-Latency-nice indicates the latency requirements of a task with respect
-to the other tasks in the system. The value of the attribute can be within
-the range of [-20, 19] both inclusive to be in-line with the values just
-like task nice values.
-
-Just like task nice, -20 is the 'highest' priority and conveys this
-task should get minimal latency, conversely 19 is the lowest priority
-and conveys this task will get the least consideration and will thus
-receive maximal latency.
-
-[peterz: rebase, squash]
-Signed-off-by: Parth Shah <parth@linux.ibm.com>
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
----
- include/linux/sched.h            |  1 +
- include/uapi/linux/sched.h       |  4 +++-
- include/uapi/linux/sched/types.h | 19 +++++++++++++++++++
- init/init_task.c                 |  3 ++-
- kernel/sched/core.c              | 27 ++++++++++++++++++++++++++-
- kernel/sched/debug.c             |  1 +
- tools/include/uapi/linux/sched.h |  4 +++-
- 7 files changed, 55 insertions(+), 4 deletions(-)
-
-diff --git a/include/linux/sched.h b/include/linux/sched.h
-index 177b3f3676ef8..80bb40a63e9aa 100644
---- a/include/linux/sched.h
-+++ b/include/linux/sched.h
-@@ -790,6 +790,7 @@ struct task_struct {
- 	int				static_prio;
- 	int				normal_prio;
- 	unsigned int			rt_priority;
-+	int				latency_prio;
-
- 	struct sched_entity		se;
- 	struct sched_rt_entity		rt;
-diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
-index 3bac0a8ceab26..b2e932c25be62 100644
---- a/include/uapi/linux/sched.h
-+++ b/include/uapi/linux/sched.h
-@@ -132,6 +132,7 @@ struct clone_args {
- #define SCHED_FLAG_KEEP_PARAMS		0x10
- #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
- #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
-+#define SCHED_FLAG_LATENCY_NICE		0x80
-
- #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
- 				 SCHED_FLAG_KEEP_PARAMS)
-@@ -143,6 +144,7 @@ struct clone_args {
- 			 SCHED_FLAG_RECLAIM		| \
- 			 SCHED_FLAG_DL_OVERRUN		| \
- 			 SCHED_FLAG_KEEP_ALL		| \
--			 SCHED_FLAG_UTIL_CLAMP)
-+			 SCHED_FLAG_UTIL_CLAMP		| \
-+			 SCHED_FLAG_LATENCY_NICE)
-
- #endif /* _UAPI_LINUX_SCHED_H */
-diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
-index f2c4589d4dbfe..db1e8199e8c80 100644
---- a/include/uapi/linux/sched/types.h
-+++ b/include/uapi/linux/sched/types.h
-@@ -10,6 +10,7 @@ struct sched_param {
-
- #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
- #define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */
-+#define SCHED_ATTR_SIZE_VER2	60	/* add: latency_nice */
-
- /*
-  * Extended scheduling parameters data structure.
-@@ -98,6 +99,22 @@ struct sched_param {
-  * scheduled on a CPU with no more capacity than the specified value.
-  *
-  * A task utilization boundary can be reset by setting the attribute to -1.
-+ *
-+ * Latency Tolerance Attributes
-+ * ===========================
-+ *
-+ * A subset of sched_attr attributes allows to specify the relative latency
-+ * requirements of a task with respect to the other tasks running/queued in the
-+ * system.
-+ *
-+ * @ sched_latency_nice	task's latency_nice value
-+ *
-+ * The latency_nice of a task can have any value in a range of
-+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
-+ *
-+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
-+ * taken for a task requiring a lower latency as opposed to the task with
-+ * higher latency_nice.
-  */
- struct sched_attr {
- 	__u32 size;
-@@ -120,6 +137,8 @@ struct sched_attr {
- 	__u32 sched_util_min;
- 	__u32 sched_util_max;
-
-+	/* latency requirement hints */
-+	__s32 sched_latency_nice;
- };
-
- #endif /* _UAPI_LINUX_SCHED_TYPES_H */
-diff --git a/init/init_task.c b/init/init_task.c
-index ff6c4b9bfe6b1..511cbcf3510dc 100644
---- a/init/init_task.c
-+++ b/init/init_task.c
-@@ -78,6 +78,7 @@ struct task_struct init_task
- 	.prio		= MAX_PRIO - 20,
- 	.static_prio	= MAX_PRIO - 20,
- 	.normal_prio	= MAX_PRIO - 20,
-+	.latency_prio	= DEFAULT_PRIO,
- 	.policy		= SCHED_NORMAL,
- 	.cpus_ptr	= &init_task.cpus_mask,
- 	.user_cpus_ptr	= NULL,
-@@ -89,7 +90,7 @@ struct task_struct init_task
- 		.fn = do_no_restart_syscall,
- 	},
- 	.se		= {
--		.group_node 	= LIST_HEAD_INIT(init_task.se.group_node),
-+		.group_node	= LIST_HEAD_INIT(init_task.se.group_node),
- 	},
- 	.rt		= {
- 		.run_list	= LIST_HEAD_INIT(init_task.rt.run_list),
-diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index a5d3422f7d0de..b3533d0d4a2ca 100644
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -4757,6 +4757,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
- 		p->prio = p->normal_prio = p->static_prio;
- 		set_load_weight(p, false);
-
-+		p->latency_prio = NICE_TO_PRIO(0);
-+
- 		/*
- 		 * We don't need the reset flag anymore after the fork. It has
- 		 * fulfilled its duty:
-@@ -7531,7 +7533,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
- #define SETPARAM_POLICY	-1
-
- static void __setscheduler_params(struct task_struct *p,
--		const struct sched_attr *attr)
-+				  const struct sched_attr *attr)
- {
- 	int policy = attr->sched_policy;
-
-@@ -7555,6 +7557,13 @@ static void __setscheduler_params(struct task_struct *p,
- 	set_load_weight(p, true);
- }
-
-+static void __setscheduler_latency(struct task_struct *p,
-+				   const struct sched_attr *attr)
-+{
-+	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
-+		p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
-+}
-+
- /*
-  * Check the target process has a UID that matches the current process's:
-  */
-@@ -7689,6 +7698,13 @@ recheck:
- 			return retval;
- 	}
-
-+	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
-+		if (attr->sched_latency_nice > MAX_NICE)
-+			return -EINVAL;
-+		if (attr->sched_latency_nice < MIN_NICE)
-+			return -EINVAL;
-+	}
-+
- 	/* Update task specific "requested" clamps */
- 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
- 		retval = uclamp_validate(p, attr);
-@@ -7736,6 +7752,9 @@ recheck:
- 			goto change;
- 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
- 			goto change;
-+		if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
-+		    attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
-+			goto change;
-
- 		p->sched_reset_on_fork = reset_on_fork;
- 		retval = 0;
-@@ -7824,6 +7843,7 @@ change:
- 		__setscheduler_params(p, attr);
- 		__setscheduler_prio(p, newprio);
- 	}
-+	__setscheduler_latency(p, attr);
- 	__setscheduler_uclamp(p, attr);
-
- 	if (queued) {
-@@ -8035,6 +8055,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
- 	    size < SCHED_ATTR_SIZE_VER1)
- 		return -EINVAL;
-
-+	if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
-+	    size < SCHED_ATTR_SIZE_VER2)
-+		return -EINVAL;
- 	/*
- 	 * XXX: Do we want to be lenient like existing syscalls; or do we want
- 	 * to be strict and return an error on out-of-bounds values?
-@@ -8272,6 +8295,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- 	get_params(p, &kattr);
- 	kattr.sched_flags &= SCHED_FLAG_ALL;
-
-+	kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio);
-+
- #ifdef CONFIG_UCLAMP_TASK
- 	/*
- 	 * This could race with another potential updater, but this is fine
-diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
-index 4c3d0d9f3db63..5c743bcb340d2 100644
---- a/kernel/sched/debug.c
-+++ b/kernel/sched/debug.c
-@@ -1086,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
- #endif
- 	P(policy);
- 	P(prio);
-+	P(latency_prio);
- 	if (task_has_dl_policy(p)) {
- 		P(dl.runtime);
- 		P(dl.deadline);
-diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
-index 3bac0a8ceab26..b2e932c25be62 100644
---- a/tools/include/uapi/linux/sched.h
-+++ b/tools/include/uapi/linux/sched.h
-@@ -132,6 +132,7 @@ struct clone_args {
- #define SCHED_FLAG_KEEP_PARAMS		0x10
- #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
- #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
-+#define SCHED_FLAG_LATENCY_NICE		0x80
-
- #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
- 				 SCHED_FLAG_KEEP_PARAMS)
-@@ -143,6 +144,7 @@ struct clone_args {
- 			 SCHED_FLAG_RECLAIM		| \
- 			 SCHED_FLAG_DL_OVERRUN		| \
- 			 SCHED_FLAG_KEEP_ALL		| \
--			 SCHED_FLAG_UTIL_CLAMP)
-+			 SCHED_FLAG_UTIL_CLAMP		| \
-+			 SCHED_FLAG_LATENCY_NICE)
-
- #endif /* _UAPI_LINUX_SCHED_H */
---
-cgit
-
-From 9f9a3323112d3aa5afa466b1e391e137f28dc79d Mon Sep 17 00:00:00 2001
-From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
-Date: Fri, 24 Feb 2023 10:34:51 +0100
-Subject: sched/fair: Implement latency-nice
-
-Implement latency-nice as a modulation of the EEVDF r_i parameter,
-specifically apply the inverse sched_prio_to_weight[] relation on
-base_slice.
-
-Given a base slice of 3 [ms], this gives a range of:
-
-  latency-nice  19: 3*1024 / 15    ~= 204.8 [ms]
-  latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]
-
-(which might not make sense)
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
----
- kernel/sched/core.c  | 14 ++++++++++----
- kernel/sched/fair.c  | 22 +++++++++++++++-------
- kernel/sched/sched.h |  2 ++
- 3 files changed, 27 insertions(+), 11 deletions(-)
-
-diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index b3533d0d4a2ca..263caac8f76b7 100644
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
- 	}
- }
-
-+static inline void set_latency_prio(struct task_struct *p, int prio)
-+{
-+	p->latency_prio = prio;
-+	set_latency_fair(&p->se, prio - MAX_RT_PRIO);
-+}
-+
- #ifdef CONFIG_UCLAMP_TASK
- /*
-  * Serializes updates of utilization clamp values
-@@ -4502,9 +4508,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
- 	p->se.nr_migrations		= 0;
- 	p->se.vruntime			= 0;
- 	p->se.vlag			= 0;
--	p->se.slice			= sysctl_sched_base_slice;
- 	INIT_LIST_HEAD(&p->se.group_node);
-
-+	set_latency_prio(p, p->latency_prio);
-+
- #ifdef CONFIG_FAIR_GROUP_SCHED
- 	p->se.cfs_rq			= NULL;
- #endif
-@@ -4756,8 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
-
- 		p->prio = p->normal_prio = p->static_prio;
- 		set_load_weight(p, false);
+-static inline bool vruntime_normalized(struct task_struct *p)
+-{
+-	struct sched_entity *se = &p->se;
 -
--		p->latency_prio = NICE_TO_PRIO(0);
-+		set_latency_prio(p, NICE_TO_PRIO(0));
-
- 		/*
- 		 * We don't need the reset flag anymore after the fork. It has
-@@ -7561,7 +7567,7 @@ static void __setscheduler_latency(struct task_struct *p,
- 				   const struct sched_attr *attr)
- {
- 	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
--		p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
-+		set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
- }
-
- /*
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 16949f7bbb172..c2019e7d46cf5 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -952,6 +952,21 @@ int sched_update_scaling(void)
- }
- #endif
-
-+void set_latency_fair(struct sched_entity *se, int prio)
-+{
-+	u32 weight = sched_prio_to_weight[prio];
-+	u64 base = sysctl_sched_base_slice;
-+
-+	/*
-+	 * For EEVDF the virtual time slope is determined by w_i (iow.
-+	 * nice) while the request time r_i is determined by
-+	 * latency-nice.
-+	 *
-+	 * Smaller request gets better latency.
-+	 */
-+	se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
-+}
-+
- static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
-
- /*
-@@ -963,13 +978,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 	if ((s64)(se->vruntime - se->deadline) < 0)
- 		return;
-
 -	/*
--	 * For EEVDF the virtual time slope is determined by w_i (iow.
--	 * nice) while the request time r_i is determined by
--	 * sysctl_sched_base_slice.
+-	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
+-	 * the dequeue_entity(.flags=0) will already have normalized the
+-	 * vruntime.
 -	 */
--	se->slice = sysctl_sched_base_slice;
+-	if (p->on_rq)
+-		return true;
 -
- 	/*
- 	 * EEVDF: vd_i = ve_i + r_i / w_i
- 	 */
-diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index bc45beee335c5..8f8d903a01892 100644
---- a/kernel/sched/sched.h
-+++ b/kernel/sched/sched.h
-@@ -2520,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
- extern unsigned int sysctl_numa_balancing_hot_threshold;
- #endif
-
-+extern void set_latency_fair(struct sched_entity *se, int prio);
-+
- #ifdef CONFIG_SCHED_HRTICK
-
+-	/*
+-	 * When !on_rq, vruntime of the task has usually NOT been normalized.
+-	 * But there are some cases where it has already been normalized:
+-	 *
+-	 * - A forked child which is waiting for being woken up by
+-	 *   wake_up_new_task().
+-	 * - A task which has been woken up by try_to_wake_up() and
+-	 *   waiting for actually being woken up by sched_ttwu_pending().
+-	 */
+-	if (!se->sum_exec_runtime ||
+-	    (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
+-		return true;
+-
+-	return false;
+-}
+-
+ #ifdef CONFIG_FAIR_GROUP_SCHED
  /*
---
-cgit
-
-From a317f35154852bc023a7ab2e3fa491e1897af72f Mon Sep 17 00:00:00 2001
-From: Vincent Guittot <vincent.guittot@linaro.org>
-Date: Fri, 24 Feb 2023 10:34:52 +0100
-Subject: sched/fair: Add sched group latency support
-
-Task can set its latency priority with sched_setattr(), which is then used
-to set the latency offset of its sched_enity, but sched group entities
-still have the default latency offset value.
-
-Add a latency.nice field in cpu cgroup controller to set the latency
-priority of the group similarly to sched_setattr(). The latency priority
-is then used to set the offset of the sched_entities of the group.
-
-Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
-Link: https://lkml.kernel.org/r/20230224093454.956298-7-vincent.guittot@linaro.org
----
- Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++
- kernel/sched/core.c                     | 30 ++++++++++++++++++++++++++++++
- kernel/sched/fair.c                     | 27 +++++++++++++++++++++++++++
- kernel/sched/sched.h                    |  4 ++++
- 4 files changed, 71 insertions(+)
-
-diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
-index 4ef8901911961..3a8d3e1e55910 100644
---- a/Documentation/admin-guide/cgroup-v2.rst
-+++ b/Documentation/admin-guide/cgroup-v2.rst
-@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
-         values similar to the sched_setattr(2). This maximum utilization
-         value is used to clamp the task specific maximum utilization clamp.
-
-+  cpu.latency.nice
-+	A read-write single value file which exists on non-root
-+	cgroups.  The default is "0".
-+
-+	The nice value is in the range [-20, 19].
-+
-+	This interface file allows reading and setting latency using the
-+	same values used by sched_setattr(2). The latency_nice of a group is
-+	used to limit the impact of the latency_nice of a task outside the
-+	group.
-
-
- Memory
-diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index 263caac8f76b7..8a541fe2d4626 100644
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -11247,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+  * Propagate the changes of the sched_entity across the tg tree to make it
+@@ -12313,16 +12254,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
+ static void detach_task_cfs_rq(struct task_struct *p)
  {
- 	return sched_group_set_idle(css_tg(css), idle);
- }
-+
-+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
-+				    struct cftype *cft)
-+{
-+	return PRIO_TO_NICE(css_tg(css)->latency_prio);
-+}
-+
-+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
-+				     struct cftype *cft, s64 nice)
-+{
-+	int prio;
-+
-+	if (nice < MIN_NICE || nice > MAX_NICE)
-+		return -ERANGE;
-+
-+	prio = NICE_TO_PRIO(nice);
-+
-+	return sched_group_set_latency(css_tg(css), prio);
-+}
- #endif
+ 	struct sched_entity *se = &p->se;
+-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+-
+-	if (!vruntime_normalized(p)) {
+-		/*
+-		 * Fix up our vruntime so that the current sleep doesn't
+-		 * cause 'unlimited' sleep bonus.
+-		 */
+-		place_entity(cfs_rq, se, 0);
+-		se->vruntime -= cfs_rq->min_vruntime;
+-	}
 
- static struct cftype cpu_legacy_files[] = {
-@@ -11261,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = {
- 		.read_s64 = cpu_idle_read_s64,
- 		.write_s64 = cpu_idle_write_s64,
- 	},
-+	{
-+		.name = "latency.nice",
-+		.read_s64 = cpu_latency_nice_read_s64,
-+		.write_s64 = cpu_latency_nice_write_s64,
-+	},
- #endif
- #ifdef CONFIG_CFS_BANDWIDTH
- 	{
-@@ -11500,6 +11524,12 @@ static struct cftype cpu_files[] = {
- 		.read_s64 = cpu_idle_read_s64,
- 		.write_s64 = cpu_idle_write_s64,
- 	},
-+	{
-+		.name = "latency.nice",
-+		.flags = CFTYPE_NOT_ON_ROOT,
-+		.read_s64 = cpu_latency_nice_read_s64,
-+		.write_s64 = cpu_latency_nice_write_s64,
-+	},
- #endif
- #ifdef CONFIG_CFS_BANDWIDTH
- 	{
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index c2019e7d46cf5..8a4799c600309 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -12545,6 +12545,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+ 	detach_entity_cfs_rq(se);
+ }
+@@ -12330,12 +12261,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
+ static void attach_task_cfs_rq(struct task_struct *p)
+ {
+ 	struct sched_entity *se = &p->se;
+-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ 	attach_entity_cfs_rq(se);
+-
+-	if (!vruntime_normalized(p))
+-		se->vruntime += cfs_rq->min_vruntime;
+ }
+
+ static void switched_from_fair(struct rq *rq, struct task_struct *p)
+@@ -12446,6 +12373,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  		goto err;
 
  	tg->shares = NICE_0_LOAD;
@@ -3367,7 +2146,7 @@ index c2019e7d46cf5..8a4799c600309 100644
 
  	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
-@@ -12643,6 +12644,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+@@ -12544,6 +12472,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
  	}
 
  	se->my_q = cfs_rq;
@@ -3377,7 +2156,7 @@ index c2019e7d46cf5..8a4799c600309 100644
  	/* guarantee group entities always have weight */
  	update_load_set(&se->load, NICE_0_LOAD);
  	se->parent = parent;
-@@ -12773,6 +12777,29 @@ next_cpu:
+@@ -12674,6 +12605,29 @@ int sched_group_set_idle(struct task_group *tg, long idle)
  	return 0;
  }
 
@@ -3407,8 +2186,73 @@ index c2019e7d46cf5..8a4799c600309 100644
  #else /* CONFIG_FAIR_GROUP_SCHED */
 
  void free_fair_sched_group(struct task_group *tg) { }
+@@ -12700,7 +12654,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
+ 	 * idle runqueue:
+ 	 */
+ 	if (rq->cfs.load.weight)
+-		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
++		rr_interval = NS_TO_JIFFIES(se->slice);
+
+ 	return rr_interval;
+ }
+@@ -12717,6 +12671,7 @@ DEFINE_SCHED_CLASS(fair) = {
+
+ 	.check_preempt_curr	= check_preempt_wakeup,
+
++	.eligible_task		= eligible_task_fair,
+ 	.pick_next_task		= __pick_next_task_fair,
+ 	.put_prev_task		= put_prev_task_fair,
+ 	.set_next_task          = set_next_task_fair,
+diff --git a/kernel/sched/features.h b/kernel/sched/features.h
+index ee7f23c76..5ae5a6f92 100644
+--- a/kernel/sched/features.h
++++ b/kernel/sched/features.h
+@@ -1,16 +1,13 @@
+ /* SPDX-License-Identifier: GPL-2.0 */
+-/*
+- * Only give sleepers 50% of their service deficit. This allows
+- * them to run sooner, but does not allow tons of sleepers to
+- * rip the spread apart.
+- */
+-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
+
+ /*
+- * Place new tasks ahead so that they do not starve already running
+- * tasks
++ * Using the avg_vruntime, do the right thing and preserve lag across
++ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
+  */
+-SCHED_FEAT(START_DEBIT, true)
++SCHED_FEAT(PLACE_LAG, true)
++SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
++SCHED_FEAT(RUN_TO_PARITY, true)
++SCHED_FEAT(DELAY_DEQUEUE, true)
+
+ /*
+  * Prefer to schedule the task we woke last (assuming it failed
+@@ -19,13 +16,6 @@ SCHED_FEAT(START_DEBIT, true)
+  */
+ SCHED_FEAT(NEXT_BUDDY, false)
+
+-/*
+- * Prefer to schedule the task that ran last (when we did
+- * wake-preempt) as that likely will touch the same data, increases
+- * cache locality.
+- */
+-SCHED_FEAT(LAST_BUDDY, true)
+-
+ /*
+  * Consider buddies to be cache hot, decreases the likeliness of a
+  * cache buddy being migrated away, increases cache locality.
+@@ -98,6 +88,3 @@ SCHED_FEAT(UTIL_EST, true)
+ SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+ SCHED_FEAT(LATENCY_WARN, false)
+-
+-SCHED_FEAT(ALT_PERIOD, true)
+-SCHED_FEAT(BASE_SLICE, true)
 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index 8f8d903a01892..4236c4c893aa7 100644
+index e93e006a9..050b447d2 100644
 --- a/kernel/sched/sched.h
 +++ b/kernel/sched/sched.h
 @@ -372,6 +372,8 @@ struct task_group {
@@ -3429,212 +2273,102 @@ index 8f8d903a01892..4236c4c893aa7 100644
  #ifdef CONFIG_SMP
  extern void set_task_rq_fair(struct sched_entity *se,
  			     struct cfs_rq *prev, struct cfs_rq *next);
---
-cgit
+@@ -548,6 +552,9 @@ struct cfs_rq {
+ 	unsigned int		idle_nr_running;   /* SCHED_IDLE */
+ 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
 
-From b412068f928064d23f67709f46d36d7659079e54 Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Mon, 22 May 2023 13:46:30 +0200
-Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice
-
-As an alternative to the latency-nice interface; allow applications to
-directly set the request/slice using sched_attr::sched_runtime.
-
-The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
-which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
-
-Applications should strive to use their periodic runtime at a high
-confidence interval (95%+) as the target slice. Using a smaller slice
-will introduce undue preemptions, while using a larger value will
-increase latency.
-
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
----
- kernel/sched/core.c | 24 ++++++++++++++++++------
- 1 file changed, 18 insertions(+), 6 deletions(-)
-
-diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index 8a541fe2d4626..5b71c398f6cf6 100644
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -7548,10 +7548,18 @@ static void __setscheduler_params(struct task_struct *p,
-
- 	p->policy = policy;
-
--	if (dl_policy(policy))
-+	if (dl_policy(policy)) {
- 		__setparam_dl(p, attr);
--	else if (fair_policy(policy))
-+	} else if (fair_policy(policy)) {
- 		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
-+		if (attr->sched_runtime) {
-+			p->se.slice = clamp_t(u64, attr->sched_runtime,
-+					      NSEC_PER_MSEC/10,   /* HZ=1000 * 10 */
-+					      NSEC_PER_MSEC*100); /* HZ=100  / 10 */
-+		} else {
-+			p->se.slice = sysctl_sched_base_slice;
-+		}
-+	}
-
- 	/*
- 	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
-@@ -7750,7 +7758,9 @@ recheck:
- 	 * but store a possible modification of reset_on_fork.
++	s64			avg_vruntime;
++	u64			avg_load;
++
+ 	u64			exec_clock;
+ 	u64			min_vruntime;
+ #ifdef CONFIG_SCHED_CORE
+@@ -567,8 +574,6 @@ struct cfs_rq {
  	 */
- 	if (unlikely(policy == p->policy)) {
--		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
-+		if (fair_policy(policy) &&
-+		    (attr->sched_nice != task_nice(p) ||
-+		     (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
- 			goto change;
- 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
- 			goto change;
-@@ -8079,12 +8089,14 @@ err_size:
+ 	struct sched_entity	*curr;
+ 	struct sched_entity	*next;
+-	struct sched_entity	*last;
+-	struct sched_entity	*skip;
 
- static void get_params(struct task_struct *p, struct sched_attr *attr)
+ #ifdef	CONFIG_SCHED_DEBUG
+ 	unsigned int		nr_spread_over;
+@@ -2195,6 +2200,7 @@ extern const u32		sched_prio_to_wmult[40];
+ #else
+ #define ENQUEUE_MIGRATED	0x00
+ #endif
++#define ENQUEUE_INITIAL		0x80
+
+ #define RETRY_TASK		((void *)-1UL)
+
+@@ -2217,6 +2223,7 @@ struct sched_class {
+
+ 	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+
++	bool (*eligible_task)(struct rq *rq, struct task_struct *p);
+ 	struct task_struct *(*pick_next_task)(struct rq *rq);
+
+ 	void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+@@ -2270,7 +2277,7 @@ struct sched_class {
+
+ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
  {
--	if (task_has_dl_policy(p))
-+	if (task_has_dl_policy(p)) {
- 		__getparam_dl(p, attr);
--	else if (task_has_rt_policy(p))
-+	} else if (task_has_rt_policy(p)) {
- 		attr->sched_priority = p->rt_priority;
--	else
-+	} else {
- 		attr->sched_nice = task_nice(p);
-+		attr->sched_runtime = p->se.slice;
-+	}
+-	WARN_ON_ONCE(rq->curr != prev);
++//	WARN_ON_ONCE(rq->curr != prev);
+ 	prev->sched_class->put_prev_task(rq, prev);
  }
 
- /**
---
-cgit
+@@ -2499,11 +2506,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+ extern const_debug unsigned int sysctl_sched_nr_migrate;
+ extern const_debug unsigned int sysctl_sched_migration_cost;
 
-From 2f88c8e802c8b128a155976631f4eb2ce4f3c805 Mon Sep 17 00:00:00 2001
-From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
-Date: Thu, 24 Aug 2023 13:33:42 +0530
-Subject: sched/eevdf/doc: Modify the documented knob to base_slice_ns as well
-
-After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns'
-sysctl to 'base_slice_ns':
-
-   e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
-
-... but we forgot to rename it in the documentation. Do that now.
-
-Fixes: e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
-Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Link: https://lore.kernel.org/r/20230824080342.543396-1-sshegde@linux.vnet.ibm.com
----
- Documentation/scheduler/sched-design-CFS.rst | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
-index 03db555045151..f68919800f050 100644
---- a/Documentation/scheduler/sched-design-CFS.rst
-+++ b/Documentation/scheduler/sched-design-CFS.rst
-@@ -94,7 +94,7 @@ other HZ detail.  Thus the CFS scheduler has no notion of "timeslices" in the
- way the previous scheduler had, and has no heuristics whatsoever.  There is
- only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
-
--   /sys/kernel/debug/sched/min_granularity_ns
-+   /sys/kernel/debug/sched/base_slice_ns
-
- which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
- "server" (i.e., good batching) workloads.  It defaults to a setting suitable
---
-cgit
-
-From 63304558ba5dcaaff9e052ee43cfdcc7f9c29e85 Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Wed, 16 Aug 2023 15:40:59 +0200
-Subject: sched/eevdf: Curb wakeup-preemption
-
-Mike and others noticed that EEVDF does like to over-schedule quite a
-bit -- which does hurt performance of a number of benchmarks /
-workloads.
-
-In particular, what seems to cause over-scheduling is that when lag is
-of the same order (or larger) than the request / slice then placement
-will not only cause the task to be placed left of current, but also
-with a smaller deadline than current, which causes immediate
-preemption.
-
-[ notably, lag bounds are relative to HZ ]
-
-Mike suggested we stick to picking 'current' for as long as it's
-eligible to run, giving it uninterrupted runtime until it reaches
-parity with the pack.
-
-Augment Mike's suggestion by only allowing it to exhaust it's initial
-request.
-
-One random data point:
-
-echo NO_RUN_TO_PARITY > /debug/sched/features
-perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
-
-	3,723,554        context-switches      ( +-  0.56% )
-	9.5136 +- 0.0394 seconds time elapsed  ( +-  0.41% )
-
-echo RUN_TO_PARITY > /debug/sched/features
-perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
-
-	2,556,535        context-switches      ( +-  0.51% )
-	9.2427 +- 0.0302 seconds time elapsed  ( +-  0.33% )
-
-Suggested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net
----
- kernel/sched/fair.c     | 12 ++++++++++++
- kernel/sched/features.h |  1 +
- 2 files changed, 13 insertions(+)
-
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index f496cef90ce77..0b7445cd5af98 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -873,6 +873,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
- 	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
- 		curr = NULL;
-
-+	/*
-+	 * Once selected, run a task until it either becomes non-eligible or
-+	 * until it gets a new slice. See the HACK in set_next_entity().
-+	 */
-+	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
-+		return curr;
++extern unsigned int sysctl_sched_base_slice;
 +
- 	while (node) {
- 		struct sched_entity *se = __node_2_se(node);
+ #ifdef CONFIG_SCHED_DEBUG
+-extern unsigned int sysctl_sched_latency;
+-extern unsigned int sysctl_sched_min_granularity;
+-extern unsigned int sysctl_sched_idle_min_granularity;
+-extern unsigned int sysctl_sched_wakeup_granularity;
+ extern int sysctl_resched_latency_warn_ms;
+ extern int sysctl_resched_latency_warn_once;
 
-@@ -5167,6 +5174,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 		update_stats_wait_end_fair(cfs_rq, se);
- 		__dequeue_entity(cfs_rq, se);
- 		update_load_avg(cfs_rq, se, UPDATE_TG);
-+		/*
-+		 * HACK, stash a copy of deadline at the point of pick in vlag,
-+		 * which isn't used until dequeue.
-+		 */
-+		se->vlag = se->deadline;
- 	}
+@@ -2516,6 +2521,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
+ extern unsigned int sysctl_numa_balancing_hot_threshold;
+ #endif
 
- 	update_stats_curr_start(cfs_rq, se);
-diff --git a/kernel/sched/features.h b/kernel/sched/features.h
-index 61bcbf5e46a45..f770168230ae4 100644
---- a/kernel/sched/features.h
-+++ b/kernel/sched/features.h
-@@ -6,6 +6,7 @@
-  */
- SCHED_FEAT(PLACE_LAG, true)
- SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
-+SCHED_FEAT(RUN_TO_PARITY, true)
++extern void set_latency_fair(struct sched_entity *se, int prio);
++
+ #ifdef CONFIG_SCHED_HRTICK
 
  /*
-  * Prefer to schedule the task we woke last (assuming it failed
---
-cgit
+@@ -3480,4 +3487,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
+ static inline void init_sched_mm_cid(struct task_struct *t) { }
+ #endif
+
++extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
++extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
++
+ #endif /* _KERNEL_SCHED_SCHED_H */
+diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
+index 3bac0a8ce..b2e932c25 100644
+--- a/tools/include/uapi/linux/sched.h
++++ b/tools/include/uapi/linux/sched.h
+@@ -132,6 +132,7 @@ struct clone_args {
+ #define SCHED_FLAG_KEEP_PARAMS		0x10
+ #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
+ #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
++#define SCHED_FLAG_LATENCY_NICE		0x80
+
+ #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
+ 				 SCHED_FLAG_KEEP_PARAMS)
+@@ -143,6 +144,7 @@ struct clone_args {
+ 			 SCHED_FLAG_RECLAIM		| \
+ 			 SCHED_FLAG_DL_OVERRUN		| \
+ 			 SCHED_FLAG_KEEP_ALL		| \
+-			 SCHED_FLAG_UTIL_CLAMP)
++			 SCHED_FLAG_UTIL_CLAMP		| \
++			 SCHED_FLAG_LATENCY_NICE)
+
+ #endif /* _UAPI_LINUX_SCHED_H */
+--
+2.42.0