Update EEVDF patches (#802)

kylon
2023-08-31 17:02:05 +02:00
committed by GitHub
parent 7b4bf31ffb
commit 2114c55a35
2 changed files with 2162 additions and 398 deletions

View File

@@ -2756,3 +2756,885 @@ index 7ff9965570e69..db5853761b1f3 100644
--
cgit
From 246c6d7ab4d042b185d7df71f437137d43cbb83a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 25 Mar 2023 00:14:04 +0100
Subject: sched/eevdf: Better handle mixed slice length
In the case where (due to latency-nice) there are different request
sizes in the tree, the smaller requests tend to be dominated by the
larger. Also note how the EEVDF lag limits are based on r_max.
Therefore, add a heuristic that, for the mixed request size case, moves
smaller requests to placement strategy #2, which ensures they are
immediately eligible and, due to their smaller (virtual) deadline,
will cause preemption.
NOTE: this relies on update_entity_lag() to impose lag limits above
a single slice.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++++
kernel/sched/features.h | 1 +
kernel/sched/sched.h | 1 +
3 files changed, 41 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c8c9f7d8496a..16949f7bbb172 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_slice += se->slice * weight;
cfs_rq->avg_load += weight;
}
@@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_slice -= se->slice * weight;
cfs_rq->avg_load -= weight;
}
@@ -4908,6 +4910,30 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
+static inline bool
+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags)
+{
+ u64 now, vdelta;
+ s64 delta;
+
+ if (!(flags & ENQUEUE_WAKEUP))
+ return false;
+
+ if (flags & ENQUEUE_MIGRATED)
+ return true;
+
+ now = rq_clock_task(rq_of(cfs_rq));
+ delta = now - se->exec_start;
+ if (delta < 0)
+ return false;
+
+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
+ if (vdelta < vslice)
+ return false;
+
+ return true;
+}
+
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -4929,6 +4955,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
lag = se->vlag;
+ /*
+ * For latency sensitive tasks; those that have a shorter than
+ * average slice and do not fully consume the slice, transition
+ * to EEVDF placement strategy #2.
+ */
+ if (sched_feat(PLACE_FUDGE) &&
+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
+ entity_has_slept(cfs_rq, se, vslice, flags)) {
+ lag += vslice;
+ if (lag > 0)
+ lag = 0;
+ }
+
/*
* If we want to place a task and preserve lag, we have to
* consider the effect of the new entity on the weighted
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 54334ca5c5c61..7d65b40299d91 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -5,6 +5,7 @@
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_FUDGE, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db5853761b1f3..bc45beee335c5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -549,6 +549,7 @@ struct cfs_rq {
unsigned int idle_h_nr_running; /* SCHED_IDLE */
s64 avg_vruntime;
+ u64 avg_slice;
u64 avg_load;
u64 exec_clock;
--
cgit
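
For reference, here is a small standalone C sketch (not part of the patch above;
every name in it is invented for illustration) of the PLACE_FUDGE test:
avg_vruntime_add()/avg_vruntime_sub() maintain avg_slice and avg_load as
load-weighted sums, so "cfs_rq->avg_slice > se->slice * cfs_rq->avg_load" is the
division-free form of asking whether this entity's request is shorter than the
load-weighted average slice of the queue.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* toy stand-in for a queued entity: request length (ns) and load weight */
struct toy_entity { uint64_t slice; uint64_t weight; };

/* same comparison PLACE_FUDGE uses, recomputed from scratch for clarity */
static bool shorter_than_weighted_avg(const struct toy_entity *q, int n, int i)
{
    uint64_t avg_slice = 0, avg_load = 0;

    for (int k = 0; k < n; k++) {   /* kept incrementally by add/sub in the patch */
        avg_slice += q[k].slice * q[k].weight;
        avg_load  += q[k].weight;
    }
    return avg_slice > q[i].slice * avg_load;
}

int main(void)
{
    /* hypothetical queue: two 3 ms requests and one 0.5 ms request, equal weight */
    struct toy_entity q[] = {
        { 3000000, 1024 }, { 3000000, 1024 }, { 500000, 1024 },
    };

    for (int i = 0; i < 3; i++)
        printf("entity %d shorter than weighted average: %d\n",
               i, shorter_than_weighted_avg(q, 3, i));
    return 0;
}
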
From 36b9081885fee5764b53970dd2d6afe8c2f13b7f Mon Sep 17 00:00:00 2001
From: Parth Shah <parth@linux.ibm.com>
Date: Sat, 11 Mar 2023 12:20:21 +0100
Subject: sched: Introduce latency-nice as a per-task attribute
Latency-nice indicates the latency requirements of a task with respect
to the other tasks in the system. The value of the attribute can be within
the range [-20, 19], both inclusive, to be in line with task nice values.
Just like task nice, -20 is the 'highest' priority and conveys that this
task should get minimal latency; conversely, 19 is the lowest priority
and conveys that this task will get the least consideration and will thus
receive maximal latency.
[peterz: rebase, squash]
Signed-off-by: Parth Shah <parth@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 1 +
include/uapi/linux/sched.h | 4 +++-
include/uapi/linux/sched/types.h | 19 +++++++++++++++++++
init/init_task.c | 3 ++-
kernel/sched/core.c | 27 ++++++++++++++++++++++++++-
kernel/sched/debug.c | 1 +
tools/include/uapi/linux/sched.h | 4 +++-
7 files changed, 55 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 177b3f3676ef8..80bb40a63e9aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -790,6 +790,7 @@ struct task_struct {
int static_prio;
int normal_prio;
unsigned int rt_priority;
+ int latency_prio;
struct sched_entity se;
struct sched_rt_entity rt;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab26..b2e932c25be62 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbfe..db1e8199e8c80 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -10,6 +10,7 @@ struct sched_param {
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */
/*
* Extended scheduling parameters data structure.
@@ -98,6 +99,22 @@ struct sched_param {
* scheduled on a CPU with no more capacity than the specified value.
*
* A task utilization boundary can be reset by setting the attribute to -1.
+ *
+ * Latency Tolerance Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows to specify the relative latency
+ * requirements of a task with respect to the other tasks running/queued in the
+ * system.
+ *
+ * @ sched_latency_nice task's latency_nice value
+ *
+ * The latency_nice of a task can have any value in a range of
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
+ *
+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
+ * taken for a task requiring a lower latency as opposed to the task with
+ * higher latency_nice.
*/
struct sched_attr {
__u32 size;
@@ -120,6 +137,8 @@ struct sched_attr {
__u32 sched_util_min;
__u32 sched_util_max;
+ /* latency requirement hints */
+ __s32 sched_latency_nice;
};
#endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b1..511cbcf3510dc 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,6 +78,7 @@ struct task_struct init_task
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
+ .latency_prio = DEFAULT_PRIO,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
@@ -89,7 +90,7 @@ struct task_struct init_task
.fn = do_no_restart_syscall,
},
.se = {
- .group_node = LIST_HEAD_INIT(init_task.se.group_node),
+ .group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5d3422f7d0de..b3533d0d4a2ca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4757,6 +4757,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->latency_prio = NICE_TO_PRIO(0);
+
/*
* We don't need the reset flag anymore after the fork. It has
* fulfilled its duty:
@@ -7531,7 +7533,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
#define SETPARAM_POLICY -1
static void __setscheduler_params(struct task_struct *p,
- const struct sched_attr *attr)
+ const struct sched_attr *attr)
{
int policy = attr->sched_policy;
@@ -7555,6 +7557,13 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
+static void __setscheduler_latency(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
+ p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+}
+
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -7689,6 +7698,13 @@ recheck:
return retval;
}
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+ if (attr->sched_latency_nice > MAX_NICE)
+ return -EINVAL;
+ if (attr->sched_latency_nice < MIN_NICE)
+ return -EINVAL;
+ }
+
/* Update task specific "requested" clamps */
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
retval = uclamp_validate(p, attr);
@@ -7736,6 +7752,9 @@ recheck:
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
+ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
retval = 0;
@@ -7824,6 +7843,7 @@ change:
__setscheduler_params(p, attr);
__setscheduler_prio(p, newprio);
}
+ __setscheduler_latency(p, attr);
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -8035,6 +8055,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
+ size < SCHED_ATTR_SIZE_VER2)
+ return -EINVAL;
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -8272,6 +8295,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
get_params(p, &kattr);
kattr.sched_flags &= SCHED_FLAG_ALL;
+ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio);
+
#ifdef CONFIG_UCLAMP_TASK
/*
* This could race with another potential updater, but this is fine
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db63..5c743bcb340d2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1086,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#endif
P(policy);
P(prio);
+ P(latency_prio);
if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
index 3bac0a8ceab26..b2e932c25be62 100644
--- a/tools/include/uapi/linux/sched.h
+++ b/tools/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
--
cgit
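
For completeness, a minimal userspace sketch of exercising the new attribute on a
kernel that carries the patch above. It is illustrative only: the sched_attr layout
is declared locally (as sched_attr_v2) because distribution headers may not yet
carry the sched_latency_nice field, and the call also (re)sets SCHED_NORMAL with
nice 0.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#define SCHED_FLAG_LATENCY_NICE 0x80    /* from the patched uapi header */

struct sched_attr_v2 {                  /* local copy of the VER2 layout */
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime;
    uint64_t sched_deadline;
    uint64_t sched_period;
    uint32_t sched_util_min;
    uint32_t sched_util_max;
    int32_t  sched_latency_nice;        /* new in SCHED_ATTR_SIZE_VER2 */
};

int main(void)
{
    struct sched_attr_v2 attr;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);           /* covers the VER2 fields */
    attr.sched_policy = 0;              /* SCHED_NORMAL; nice defaults to 0 */
    attr.sched_flags = SCHED_FLAG_LATENCY_NICE;
    attr.sched_latency_nice = -10;      /* valid range [-20, 19], lower = less latency */

    if (syscall(SYS_sched_setattr, 0, &attr, 0))    /* pid 0 == calling task */
        perror("sched_setattr");
    return 0;
}
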
From 9f9a3323112d3aa5afa466b1e391e137f28dc79d Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Fri, 24 Feb 2023 10:34:51 +0100
Subject: sched/fair: Implement latency-nice
Implement latency-nice as a modulation of the EEVDF r_i parameter,
specifically apply the inverse sched_prio_to_weight[] relation on
base_slice.
Given a base slice of 3 [ms], this gives a range of:
latency-nice 19: 3*1024 / 15 ~= 204.8 [ms]
latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]
(which might not make sense)
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
kernel/sched/core.c | 14 ++++++++++----
kernel/sched/fair.c | 22 +++++++++++++++-------
kernel/sched/sched.h | 2 ++
3 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b3533d0d4a2ca..263caac8f76b7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}
+static inline void set_latency_prio(struct task_struct *p, int prio)
+{
+ p->latency_prio = prio;
+ set_latency_fair(&p->se, prio - MAX_RT_PRIO);
+}
+
#ifdef CONFIG_UCLAMP_TASK
/*
* Serializes updates of utilization clamp values
@@ -4502,9 +4508,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
- p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
+ set_latency_prio(p, p->latency_prio);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
@@ -4756,8 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
-
- p->latency_prio = NICE_TO_PRIO(0);
+ set_latency_prio(p, NICE_TO_PRIO(0));
/*
* We don't need the reset flag anymore after the fork. It has
@@ -7561,7 +7567,7 @@ static void __setscheduler_latency(struct task_struct *p,
const struct sched_attr *attr)
{
if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
- p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 16949f7bbb172..c2019e7d46cf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -952,6 +952,21 @@ int sched_update_scaling(void)
}
#endif
+void set_latency_fair(struct sched_entity *se, int prio)
+{
+ u32 weight = sched_prio_to_weight[prio];
+ u64 base = sysctl_sched_base_slice;
+
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * latency-nice.
+ *
+ * Smaller request gets better latency.
+ */
+ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
+}
+
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
@@ -963,13 +978,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
if ((s64)(se->vruntime - se->deadline) < 0)
return;
- /*
- * For EEVDF the virtual time slope is determined by w_i (iow.
- * nice) while the request time r_i is determined by
- * sysctl_sched_base_slice.
- */
- se->slice = sysctl_sched_base_slice;
-
/*
* EEVDF: vd_i = ve_i + r_i / w_i
*/
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bc45beee335c5..8f8d903a01892 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2520,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
+extern void set_latency_fair(struct sched_entity *se, int prio);
+
#ifdef CONFIG_SCHED_HRTICK
/*
--
cgit
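
As a quick sanity check of the mapping introduced above
(se->slice = (base << SCHED_FIXEDPOINT_SHIFT) / weight), here is a standalone C
sketch that reproduces the 3 ms base-slice endpoints quoted in the changelog, using
the standard sched_prio_to_weight[] values for nice -20, 0 and 19 (any difference
against the quoted numbers is just rounding):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t base_slice_ns = 3000000;     /* 3 ms, as in the changelog example */
    const struct { int nice; uint32_t weight; } lat[] = {
        { -20, 88761 }, { 0, 1024 }, { 19, 15 },
    };

    for (unsigned int i = 0; i < sizeof(lat) / sizeof(lat[0]); i++) {
        /* SCHED_FIXEDPOINT_SHIFT == 10, i.e. multiply by 1024 */
        uint64_t slice_ns = (base_slice_ns << 10) / lat[i].weight;

        printf("latency-nice %3d -> r_i = %9.4f ms\n",
               lat[i].nice, slice_ns / 1e6);
    }
    return 0;
}
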
From a317f35154852bc023a7ab2e3fa491e1897af72f Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 24 Feb 2023 10:34:52 +0100
Subject: sched/fair: Add sched group latency support
A task can set its latency priority with sched_setattr(), which is then used
to set the latency offset of its sched_entity, but sched group entities
still have the default latency offset value.
Add a latency.nice field in cpu cgroup controller to set the latency
priority of the group similarly to sched_setattr(). The latency priority
is then used to set the offset of the sched_entities of the group.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20230224093454.956298-7-vincent.guittot@linaro.org
---
Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++
kernel/sched/core.c | 30 ++++++++++++++++++++++++++++++
kernel/sched/fair.c | 27 +++++++++++++++++++++++++++
kernel/sched/sched.h | 4 ++++
4 files changed, 71 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 4ef8901911961..3a8d3e1e55910 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
values similar to the sched_setattr(2). This maximum utilization
value is used to clamp the task specific maximum utilization clamp.
+ cpu.latency.nice
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ The nice value is in the range [-20, 19].
+
+ This interface file allows reading and setting latency using the
+ same values used by sched_setattr(2). The latency_nice of a group is
+ used to limit the impact of the latency_nice of a task outside the
+ group.
Memory
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 263caac8f76b7..8a541fe2d4626 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11247,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return PRIO_TO_NICE(css_tg(css)->latency_prio);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ int prio;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ prio = NICE_TO_PRIO(nice);
+
+ return sched_group_set_latency(css_tg(css), prio);
+}
#endif
static struct cftype cpu_legacy_files[] = {
@@ -11261,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -11500,6 +11524,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2019e7d46cf5..8a4799c600309 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12545,6 +12545,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;
tg->shares = NICE_0_LOAD;
+ tg->latency_prio = DEFAULT_PRIO;
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
@@ -12643,6 +12644,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}
se->my_q = cfs_rq;
+
+ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO);
+
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -12773,6 +12777,29 @@ next_cpu:
return 0;
}
+int sched_group_set_latency(struct task_group *tg, int prio)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->latency_prio == prio) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->latency_prio = prio;
+
+ for_each_possible_cpu(i)
+ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO);
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8f8d903a01892..4236c4c893aa7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -372,6 +372,8 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
+ /* latency priority of the group. */
+ int latency_prio;
#ifdef CONFIG_SMP
/*
@@ -482,6 +484,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
+extern int sched_group_set_latency(struct task_group *tg, int prio);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
--
cgit
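
A minimal sketch of driving the new interface from C, assuming cgroup v2 is mounted
at /sys/fs/cgroup and a child group (hypothetically named "lowlat") already exists;
per the write handler above, values outside [-20, 19] are rejected with -ERANGE.

#include <stdio.h>

int main(void)
{
    const char *path = "/sys/fs/cgroup/lowlat/cpu.latency.nice";
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return 1;
    }
    fprintf(f, "%d\n", -10);    /* group-level latency nice, same scale as the task attribute */
    fclose(f);
    return 0;
}
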
From b412068f928064d23f67709f46d36d7659079e54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 May 2023 13:46:30 +0200
Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice
As an alternative to the latency-nice interface, allow applications to
directly set the request/slice using sched_attr::sched_runtime.
The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
Applications should strive to use their periodic runtime at a high
confidence interval (95%+) as the target slice. Using a smaller slice
will introduce undue preemptions, while using a larger value will
increase latency.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a541fe2d4626..5b71c398f6cf6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7548,10 +7548,18 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
@@ -7750,7 +7758,9 @@ recheck:
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
@@ -8079,12 +8089,14 @@ err_size:
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}
/**
--
cgit
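
A minimal userspace sketch of requesting a specific slice through
sched_attr::sched_runtime on a kernel with the patch above. sched_runtime is already
part of the original (VER0) sched_attr layout, so a 48-byte struct suffices; the
kernel clamps the request to [0.1 ms, 100 ms], and the call also (re)sets
SCHED_NORMAL with nice 0.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr_v0 {          /* SCHED_ATTR_SIZE_VER0 == 48 */
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime;
    uint64_t sched_deadline;
    uint64_t sched_period;
};

int main(void)
{
    struct sched_attr_v0 attr;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.sched_policy = 0;              /* SCHED_NORMAL */
    attr.sched_runtime = 500000;        /* ask for a 0.5 ms request/slice (ns) */

    if (syscall(SYS_sched_setattr, 0, &attr, 0))    /* pid 0 == calling task */
        perror("sched_setattr");
    return 0;
}
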
From 2f88c8e802c8b128a155976631f4eb2ce4f3c805 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Date: Thu, 24 Aug 2023 13:33:42 +0530
Subject: sched/eevdf/doc: Modify the documented knob to base_slice_ns as well
After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns'
sysctl to 'base_slice_ns':
e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
... but we forgot to rename it in the documentation. Do that now.
Fixes: e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230824080342.543396-1-sshegde@linux.vnet.ibm.com
---
Documentation/scheduler/sched-design-CFS.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 03db555045151..f68919800f050 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
way the previous scheduler had, and has no heuristics whatsoever. There is
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
- /sys/kernel/debug/sched/min_granularity_ns
+ /sys/kernel/debug/sched/base_slice_ns
which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
"server" (i.e., good batching) workloads. It defaults to a setting suitable
--
cgit
From 63304558ba5dcaaff9e052ee43cfdcc7f9c29e85 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Aug 2023 15:40:59 +0200
Subject: sched/eevdf: Curb wakeup-preemption
Mike and others noticed that EEVDF does like to over-schedule quite a
bit -- which does hurt performance of a number of benchmarks /
workloads.
In particular, what seems to cause over-scheduling is that when lag is
of the same order (or larger) than the request / slice then placement
will not only cause the task to be placed left of current, but also
with a smaller deadline than current, which causes immediate
preemption.
[ notably, lag bounds are relative to HZ ]
Mike suggested we stick to picking 'current' for as long as it's
eligible to run, giving it uninterrupted runtime until it reaches
parity with the pack.
Augment Mike's suggestion by only allowing it to exhaust its initial
request.
One random data point:
echo NO_RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
3,723,554 context-switches ( +- 0.56% )
9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% )
echo RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
2,556,535 context-switches ( +- 0.51% )
9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% )
Suggested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net
---
kernel/sched/fair.c | 12 ++++++++++++
kernel/sched/features.h | 1 +
2 files changed, 13 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f496cef90ce77..0b7445cd5af98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,6 +873,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
+
while (node) {
struct sched_entity *se = __node_2_se(node);
@@ -5167,6 +5174,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}
update_stats_curr_start(cfs_rq, se);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 61bcbf5e46a45..f770168230ae4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -6,6 +6,7 @@
*/
SCHED_FEAT(PLACE_LAG, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
--
cgit

View File

@@ -2756,3 +2756,885 @@ index 7ff9965570e69..db5853761b1f3 100644
--
cgit
From 246c6d7ab4d042b185d7df71f437137d43cbb83a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 25 Mar 2023 00:14:04 +0100
Subject: sched/eevdf: Better handle mixed slice length
In the case where (due to latency-nice) there are different request
sizes in the tree, the smaller requests tend to be dominated by the
larger. Also note how the EEVDF lag limits are based on r_max.
Therefore, add a heuristic that, for the mixed request size case, moves
smaller requests to placement strategy #2, which ensures they are
immediately eligible and, due to their smaller (virtual) deadline,
will cause preemption.
NOTE: this relies on update_entity_lag() to impose lag limits above
a single slice.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++++
kernel/sched/features.h | 1 +
kernel/sched/sched.h | 1 +
3 files changed, 41 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c8c9f7d8496a..16949f7bbb172 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_slice += se->slice * weight;
cfs_rq->avg_load += weight;
}
@@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_slice -= se->slice * weight;
cfs_rq->avg_load -= weight;
}
@@ -4908,6 +4910,30 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
+static inline bool
+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags)
+{
+ u64 now, vdelta;
+ s64 delta;
+
+ if (!(flags & ENQUEUE_WAKEUP))
+ return false;
+
+ if (flags & ENQUEUE_MIGRATED)
+ return true;
+
+ now = rq_clock_task(rq_of(cfs_rq));
+ delta = now - se->exec_start;
+ if (delta < 0)
+ return false;
+
+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
+ if (vdelta < vslice)
+ return false;
+
+ return true;
+}
+
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -4929,6 +4955,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
lag = se->vlag;
+ /*
+ * For latency sensitive tasks; those that have a shorter than
+ * average slice and do not fully consume the slice, transition
+ * to EEVDF placement strategy #2.
+ */
+ if (sched_feat(PLACE_FUDGE) &&
+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
+ entity_has_slept(cfs_rq, se, vslice, flags)) {
+ lag += vslice;
+ if (lag > 0)
+ lag = 0;
+ }
+
/*
* If we want to place a task and preserve lag, we have to
* consider the effect of the new entity on the weighted
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 54334ca5c5c61..7d65b40299d91 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -5,6 +5,7 @@
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_FUDGE, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db5853761b1f3..bc45beee335c5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -549,6 +549,7 @@ struct cfs_rq {
unsigned int idle_h_nr_running; /* SCHED_IDLE */
s64 avg_vruntime;
+ u64 avg_slice;
u64 avg_load;
u64 exec_clock;
--
cgit
From 36b9081885fee5764b53970dd2d6afe8c2f13b7f Mon Sep 17 00:00:00 2001
From: Parth Shah <parth@linux.ibm.com>
Date: Sat, 11 Mar 2023 12:20:21 +0100
Subject: sched: Introduce latency-nice as a per-task attribute
Latency-nice indicates the latency requirements of a task with respect
to the other tasks in the system. The value of the attribute can be within
the range [-20, 19], both inclusive, to be in line with task nice values.
Just like task nice, -20 is the 'highest' priority and conveys that this
task should get minimal latency; conversely, 19 is the lowest priority
and conveys that this task will get the least consideration and will thus
receive maximal latency.
[peterz: rebase, squash]
Signed-off-by: Parth Shah <parth@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 1 +
include/uapi/linux/sched.h | 4 +++-
include/uapi/linux/sched/types.h | 19 +++++++++++++++++++
init/init_task.c | 3 ++-
kernel/sched/core.c | 27 ++++++++++++++++++++++++++-
kernel/sched/debug.c | 1 +
tools/include/uapi/linux/sched.h | 4 +++-
7 files changed, 55 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 177b3f3676ef8..80bb40a63e9aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -790,6 +790,7 @@ struct task_struct {
int static_prio;
int normal_prio;
unsigned int rt_priority;
+ int latency_prio;
struct sched_entity se;
struct sched_rt_entity rt;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab26..b2e932c25be62 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbfe..db1e8199e8c80 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -10,6 +10,7 @@ struct sched_param {
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */
/*
* Extended scheduling parameters data structure.
@@ -98,6 +99,22 @@ struct sched_param {
* scheduled on a CPU with no more capacity than the specified value.
*
* A task utilization boundary can be reset by setting the attribute to -1.
+ *
+ * Latency Tolerance Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows to specify the relative latency
+ * requirements of a task with respect to the other tasks running/queued in the
+ * system.
+ *
+ * @ sched_latency_nice task's latency_nice value
+ *
+ * The latency_nice of a task can have any value in a range of
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
+ *
+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
+ * taken for a task requiring a lower latency as opposed to the task with
+ * higher latency_nice.
*/
struct sched_attr {
__u32 size;
@@ -120,6 +137,8 @@ struct sched_attr {
__u32 sched_util_min;
__u32 sched_util_max;
+ /* latency requirement hints */
+ __s32 sched_latency_nice;
};
#endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b1..511cbcf3510dc 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,6 +78,7 @@ struct task_struct init_task
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
+ .latency_prio = DEFAULT_PRIO,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
@@ -89,7 +90,7 @@ struct task_struct init_task
.fn = do_no_restart_syscall,
},
.se = {
- .group_node = LIST_HEAD_INIT(init_task.se.group_node),
+ .group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5d3422f7d0de..b3533d0d4a2ca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4757,6 +4757,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->latency_prio = NICE_TO_PRIO(0);
+
/*
* We don't need the reset flag anymore after the fork. It has
* fulfilled its duty:
@@ -7531,7 +7533,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
#define SETPARAM_POLICY -1
static void __setscheduler_params(struct task_struct *p,
- const struct sched_attr *attr)
+ const struct sched_attr *attr)
{
int policy = attr->sched_policy;
@@ -7555,6 +7557,13 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
+static void __setscheduler_latency(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
+ p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+}
+
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -7689,6 +7698,13 @@ recheck:
return retval;
}
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+ if (attr->sched_latency_nice > MAX_NICE)
+ return -EINVAL;
+ if (attr->sched_latency_nice < MIN_NICE)
+ return -EINVAL;
+ }
+
/* Update task specific "requested" clamps */
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
retval = uclamp_validate(p, attr);
@@ -7736,6 +7752,9 @@ recheck:
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
+ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
retval = 0;
@@ -7824,6 +7843,7 @@ change:
__setscheduler_params(p, attr);
__setscheduler_prio(p, newprio);
}
+ __setscheduler_latency(p, attr);
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -8035,6 +8055,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
+ size < SCHED_ATTR_SIZE_VER2)
+ return -EINVAL;
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -8272,6 +8295,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
get_params(p, &kattr);
kattr.sched_flags &= SCHED_FLAG_ALL;
+ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio);
+
#ifdef CONFIG_UCLAMP_TASK
/*
* This could race with another potential updater, but this is fine
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db63..5c743bcb340d2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1086,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#endif
P(policy);
P(prio);
+ P(latency_prio);
if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
index 3bac0a8ceab26..b2e932c25be62 100644
--- a/tools/include/uapi/linux/sched.h
+++ b/tools/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
--
cgit
From 9f9a3323112d3aa5afa466b1e391e137f28dc79d Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Fri, 24 Feb 2023 10:34:51 +0100
Subject: sched/fair: Implement latency-nice
Implement latency-nice as a modulation of the EEVDF r_i parameter,
specifically apply the inverse sched_prio_to_weight[] relation on
base_slice.
Given a base slice of 3 [ms], this gives a range of:
latency-nice 19: 3*1024 / 15 ~= 204.8 [ms]
latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]
(which might not make sense)
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
kernel/sched/core.c | 14 ++++++++++----
kernel/sched/fair.c | 22 +++++++++++++++-------
kernel/sched/sched.h | 2 ++
3 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b3533d0d4a2ca..263caac8f76b7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}
+static inline void set_latency_prio(struct task_struct *p, int prio)
+{
+ p->latency_prio = prio;
+ set_latency_fair(&p->se, prio - MAX_RT_PRIO);
+}
+
#ifdef CONFIG_UCLAMP_TASK
/*
* Serializes updates of utilization clamp values
@@ -4502,9 +4508,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
- p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
+ set_latency_prio(p, p->latency_prio);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
@@ -4756,8 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
-
- p->latency_prio = NICE_TO_PRIO(0);
+ set_latency_prio(p, NICE_TO_PRIO(0));
/*
* We don't need the reset flag anymore after the fork. It has
@@ -7561,7 +7567,7 @@ static void __setscheduler_latency(struct task_struct *p,
const struct sched_attr *attr)
{
if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
- p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 16949f7bbb172..c2019e7d46cf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -952,6 +952,21 @@ int sched_update_scaling(void)
}
#endif
+void set_latency_fair(struct sched_entity *se, int prio)
+{
+ u32 weight = sched_prio_to_weight[prio];
+ u64 base = sysctl_sched_base_slice;
+
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * latency-nice.
+ *
+ * Smaller request gets better latency.
+ */
+ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
+}
+
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
@@ -963,13 +978,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
if ((s64)(se->vruntime - se->deadline) < 0)
return;
- /*
- * For EEVDF the virtual time slope is determined by w_i (iow.
- * nice) while the request time r_i is determined by
- * sysctl_sched_base_slice.
- */
- se->slice = sysctl_sched_base_slice;
-
/*
* EEVDF: vd_i = ve_i + r_i / w_i
*/
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bc45beee335c5..8f8d903a01892 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2520,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
+extern void set_latency_fair(struct sched_entity *se, int prio);
+
#ifdef CONFIG_SCHED_HRTICK
/*
--
cgit
From a317f35154852bc023a7ab2e3fa491e1897af72f Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 24 Feb 2023 10:34:52 +0100
Subject: sched/fair: Add sched group latency support
A task can set its latency priority with sched_setattr(), which is then used
to set the latency offset of its sched_entity, but sched group entities
still have the default latency offset value.
Add a latency.nice field in cpu cgroup controller to set the latency
priority of the group similarly to sched_setattr(). The latency priority
is then used to set the offset of the sched_entities of the group.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20230224093454.956298-7-vincent.guittot@linaro.org
---
Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++
kernel/sched/core.c | 30 ++++++++++++++++++++++++++++++
kernel/sched/fair.c | 27 +++++++++++++++++++++++++++
kernel/sched/sched.h | 4 ++++
4 files changed, 71 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 4ef8901911961..3a8d3e1e55910 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
values similar to the sched_setattr(2). This maximum utilization
value is used to clamp the task specific maximum utilization clamp.
+ cpu.latency.nice
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ The nice value is in the range [-20, 19].
+
+ This interface file allows reading and setting latency using the
+ same values used by sched_setattr(2). The latency_nice of a group is
+ used to limit the impact of the latency_nice of a task outside the
+ group.
Memory
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 263caac8f76b7..8a541fe2d4626 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11247,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return PRIO_TO_NICE(css_tg(css)->latency_prio);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ int prio;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ prio = NICE_TO_PRIO(nice);
+
+ return sched_group_set_latency(css_tg(css), prio);
+}
#endif
static struct cftype cpu_legacy_files[] = {
@@ -11261,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -11500,6 +11524,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2019e7d46cf5..8a4799c600309 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12545,6 +12545,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;
tg->shares = NICE_0_LOAD;
+ tg->latency_prio = DEFAULT_PRIO;
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
@@ -12643,6 +12644,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}
se->my_q = cfs_rq;
+
+ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO);
+
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -12773,6 +12777,29 @@ next_cpu:
return 0;
}
+int sched_group_set_latency(struct task_group *tg, int prio)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->latency_prio == prio) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->latency_prio = prio;
+
+ for_each_possible_cpu(i)
+ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO);
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8f8d903a01892..4236c4c893aa7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -372,6 +372,8 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
+ /* latency priority of the group. */
+ int latency_prio;
#ifdef CONFIG_SMP
/*
@@ -482,6 +484,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
+extern int sched_group_set_latency(struct task_group *tg, int prio);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
--
cgit
From b412068f928064d23f67709f46d36d7659079e54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 May 2023 13:46:30 +0200
Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice
As an alternative to the latency-nice interface, allow applications to
directly set the request/slice using sched_attr::sched_runtime.
The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
Applications should strive to use their periodic runtime at a high
confidence interval (95%+) as the target slice. Using a smaller slice
will introduce undue preemptions, while using a larger value will
increase latency.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a541fe2d4626..5b71c398f6cf6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7548,10 +7548,18 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
@@ -7750,7 +7758,9 @@ recheck:
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
@@ -8079,12 +8089,14 @@ err_size:
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}
/**
--
cgit
From 2f88c8e802c8b128a155976631f4eb2ce4f3c805 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Date: Thu, 24 Aug 2023 13:33:42 +0530
Subject: sched/eevdf/doc: Modify the documented knob to base_slice_ns as well
After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns'
sysctl to 'base_slice_ns':
e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
... but we forgot to rename it in the documentation. Do that now.
Fixes: e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230824080342.543396-1-sshegde@linux.vnet.ibm.com
---
Documentation/scheduler/sched-design-CFS.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 03db555045151..f68919800f050 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
way the previous scheduler had, and has no heuristics whatsoever. There is
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
- /sys/kernel/debug/sched/min_granularity_ns
+ /sys/kernel/debug/sched/base_slice_ns
which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
"server" (i.e., good batching) workloads. It defaults to a setting suitable
--
cgit
From 63304558ba5dcaaff9e052ee43cfdcc7f9c29e85 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Aug 2023 15:40:59 +0200
Subject: sched/eevdf: Curb wakeup-preemption
Mike and others noticed that EEVDF does like to over-schedule quite a
bit -- which does hurt performance of a number of benchmarks /
workloads.
In particular, what seems to cause over-scheduling is that when lag is
of the same order (or larger) than the request / slice then placement
will not only cause the task to be placed left of current, but also
with a smaller deadline than current, which causes immediate
preemption.
[ notably, lag bounds are relative to HZ ]
Mike suggested we stick to picking 'current' for as long as it's
eligible to run, giving it uninterrupted runtime until it reaches
parity with the pack.
Augment Mike's suggestion by only allowing it to exhaust its initial
request.
One random data point:
echo NO_RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
3,723,554 context-switches ( +- 0.56% )
9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% )
echo RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
2,556,535 context-switches ( +- 0.51% )
9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% )
Suggested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net
---
kernel/sched/fair.c | 12 ++++++++++++
kernel/sched/features.h | 1 +
2 files changed, 13 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f496cef90ce77..0b7445cd5af98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,6 +873,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
+
while (node) {
struct sched_entity *se = __node_2_se(node);
@@ -5167,6 +5174,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}
update_stats_curr_start(cfs_rq, se);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 61bcbf5e46a45..f770168230ae4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -6,6 +6,7 @@
*/
SCHED_FEAT(PLACE_LAG, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
--
cgit