Update EEVDF patches (#802)
@@ -2756,3 +2756,885 @@ index 7ff9965570e69..db5853761b1f3 100644
--
cgit

From 246c6d7ab4d042b185d7df71f437137d43cbb83a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 25 Mar 2023 00:14:04 +0100
Subject: sched/eevdf: Better handle mixed slice length

In the case where (due to latency-nice) there are different request
sizes in the tree, the smaller requests tend to be dominated by the
larger. Also note how the EEVDF lag limits are based on r_max.

Therefore; add a heuristic that, for the mixed request size case, moves
smaller requests to placement strategy #2, which ensures they're
immediately eligible and, due to their smaller (virtual) deadline, will
cause preemption.

NOTE: this relies on update_entity_lag() to impose lag limits above
a single slice.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c     | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/features.h |  1 +
 kernel/sched/sched.h    |  1 +
 3 files changed, 41 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
||||
index 5c8c9f7d8496a..16949f7bbb172 100644
|
||||
--- a/kernel/sched/fair.c
|
||||
+++ b/kernel/sched/fair.c
|
||||
@@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
s64 key = entity_key(cfs_rq, se);
|
||||
|
||||
cfs_rq->avg_vruntime += key * weight;
|
||||
+ cfs_rq->avg_slice += se->slice * weight;
|
||||
cfs_rq->avg_load += weight;
|
||||
}
|
||||
|
||||
@@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
s64 key = entity_key(cfs_rq, se);
|
||||
|
||||
cfs_rq->avg_vruntime -= key * weight;
|
||||
+ cfs_rq->avg_slice -= se->slice * weight;
|
||||
cfs_rq->avg_load -= weight;
|
||||
}
|
||||
|
||||
@@ -4908,6 +4910,30 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
+static inline bool
|
||||
+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags)
|
||||
+{
|
||||
+ u64 now, vdelta;
|
||||
+ s64 delta;
|
||||
+
|
||||
+ if (!(flags & ENQUEUE_WAKEUP))
|
||||
+ return false;
|
||||
+
|
||||
+ if (flags & ENQUEUE_MIGRATED)
|
||||
+ return true;
|
||||
+
|
||||
+ now = rq_clock_task(rq_of(cfs_rq));
|
||||
+ delta = now - se->exec_start;
|
||||
+ if (delta < 0)
|
||||
+ return false;
|
||||
+
|
||||
+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
|
||||
+ if (vdelta < vslice)
|
||||
+ return false;
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
static void
|
||||
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
{
|
||||
@@ -4929,6 +4955,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
|
||||
lag = se->vlag;
|
||||
|
||||
+ /*
|
||||
+ * For latency sensitive tasks; those that have a shorter than
|
||||
+ * average slice and do not fully consume the slice, transition
|
||||
+ * to EEVDF placement strategy #2.
|
||||
+ */
|
||||
+ if (sched_feat(PLACE_FUDGE) &&
|
||||
+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
|
||||
+ entity_has_slept(cfs_rq, se, vslice, flags)) {
|
||||
+ lag += vslice;
|
||||
+ if (lag > 0)
|
||||
+ lag = 0;
|
||||
+ }
|
||||
+
|
||||
/*
|
||||
* If we want to place a task and preserve lag, we have to
|
||||
* consider the effect of the new entity on the weighted
|
||||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
||||
index 54334ca5c5c61..7d65b40299d91 100644
|
||||
--- a/kernel/sched/features.h
|
||||
+++ b/kernel/sched/features.h
|
||||
@@ -5,6 +5,7 @@
|
||||
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
|
||||
*/
|
||||
SCHED_FEAT(PLACE_LAG, true)
|
||||
+SCHED_FEAT(PLACE_FUDGE, true)
|
||||
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
|
||||
|
||||
/*
|
||||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
||||
index db5853761b1f3..bc45beee335c5 100644
|
||||
--- a/kernel/sched/sched.h
|
||||
+++ b/kernel/sched/sched.h
|
||||
@@ -549,6 +549,7 @@ struct cfs_rq {
|
||||
unsigned int idle_h_nr_running; /* SCHED_IDLE */
|
||||
|
||||
s64 avg_vruntime;
|
||||
+ u64 avg_slice;
|
||||
u64 avg_load;
|
||||
|
||||
u64 exec_clock;
|
||||
--
|
||||
cgit
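
The PLACE_FUDGE test above compares an entity's request against the
load-weighted average of all queued requests without a division:
avg_slice accumulates slice * weight and avg_load accumulates weight, so
"avg_slice > se->slice * avg_load" is the same check as "se->slice is
shorter than the weighted average slice". A minimal stand-alone sketch of
that arithmetic (user-space C, made-up numbers, not part of the patches):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* two queued entities: a 3 ms request and a 0.5 ms request, both at weight 1024 */
	uint64_t avg_slice = 3000000ULL * 1024 + 500000ULL * 1024;
	uint64_t avg_load  = 1024 + 1024;
	uint64_t se_slice  = 500000;	/* candidate entity: 0.5 ms request */

	int shorter_than_avg = avg_slice > se_slice * avg_load;

	printf("weighted average slice: %.2f ms\n",
	       (double)avg_slice / avg_load / 1e6);
	printf("0.5 ms request shorter than average: %s\n",
	       shorter_than_avg ? "yes (strategy #2 candidate)" : "no");
	return 0;
}
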
From 36b9081885fee5764b53970dd2d6afe8c2f13b7f Mon Sep 17 00:00:00 2001
From: Parth Shah <parth@linux.ibm.com>
Date: Sat, 11 Mar 2023 12:20:21 +0100
Subject: sched: Introduce latency-nice as a per-task attribute

Latency-nice indicates the latency requirements of a task with respect
to the other tasks in the system. The value of the attribute can be
within the range [-20, 19], both inclusive, in line with task nice
values.

Just like task nice, -20 is the 'highest' priority and conveys that this
task should get minimal latency; conversely, 19 is the lowest priority
and conveys that this task will get the least consideration and will
thus receive maximal latency.

[peterz: rebase, squash]
Signed-off-by: Parth Shah <parth@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h            |  1 +
 include/uapi/linux/sched.h       |  4 +++-
 include/uapi/linux/sched/types.h | 19 +++++++++++++++++++
 init/init_task.c                 |  3 ++-
 kernel/sched/core.c              | 27 ++++++++++++++++++++++++++-
 kernel/sched/debug.c             |  1 +
 tools/include/uapi/linux/sched.h |  4 +++-
 7 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
|
||||
index 177b3f3676ef8..80bb40a63e9aa 100644
|
||||
--- a/include/linux/sched.h
|
||||
+++ b/include/linux/sched.h
|
||||
@@ -790,6 +790,7 @@ struct task_struct {
|
||||
int static_prio;
|
||||
int normal_prio;
|
||||
unsigned int rt_priority;
|
||||
+ int latency_prio;
|
||||
|
||||
struct sched_entity se;
|
||||
struct sched_rt_entity rt;
|
||||
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
|
||||
index 3bac0a8ceab26..b2e932c25be62 100644
|
||||
--- a/include/uapi/linux/sched.h
|
||||
+++ b/include/uapi/linux/sched.h
|
||||
@@ -132,6 +132,7 @@ struct clone_args {
|
||||
#define SCHED_FLAG_KEEP_PARAMS 0x10
|
||||
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
|
||||
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
|
||||
+#define SCHED_FLAG_LATENCY_NICE 0x80
|
||||
|
||||
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
|
||||
SCHED_FLAG_KEEP_PARAMS)
|
||||
@@ -143,6 +144,7 @@ struct clone_args {
|
||||
SCHED_FLAG_RECLAIM | \
|
||||
SCHED_FLAG_DL_OVERRUN | \
|
||||
SCHED_FLAG_KEEP_ALL | \
|
||||
- SCHED_FLAG_UTIL_CLAMP)
|
||||
+ SCHED_FLAG_UTIL_CLAMP | \
|
||||
+ SCHED_FLAG_LATENCY_NICE)
|
||||
|
||||
#endif /* _UAPI_LINUX_SCHED_H */
|
||||
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
|
||||
index f2c4589d4dbfe..db1e8199e8c80 100644
|
||||
--- a/include/uapi/linux/sched/types.h
|
||||
+++ b/include/uapi/linux/sched/types.h
|
||||
@@ -10,6 +10,7 @@ struct sched_param {
|
||||
|
||||
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
|
||||
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
|
||||
+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */
|
||||
|
||||
/*
|
||||
* Extended scheduling parameters data structure.
|
||||
@@ -98,6 +99,22 @@ struct sched_param {
|
||||
* scheduled on a CPU with no more capacity than the specified value.
|
||||
*
|
||||
* A task utilization boundary can be reset by setting the attribute to -1.
|
||||
+ *
|
||||
+ * Latency Tolerance Attributes
|
||||
+ * ===========================
|
||||
+ *
|
||||
+ * A subset of sched_attr attributes allows to specify the relative latency
|
||||
+ * requirements of a task with respect to the other tasks running/queued in the
|
||||
+ * system.
|
||||
+ *
|
||||
+ * @ sched_latency_nice task's latency_nice value
|
||||
+ *
|
||||
+ * The latency_nice of a task can have any value in a range of
|
||||
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
|
||||
+ *
|
||||
+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
|
||||
+ * taken for a task requiring a lower latency as opposed to the task with
|
||||
+ * higher latency_nice.
|
||||
*/
|
||||
struct sched_attr {
|
||||
__u32 size;
|
||||
@@ -120,6 +137,8 @@ struct sched_attr {
|
||||
__u32 sched_util_min;
|
||||
__u32 sched_util_max;
|
||||
|
||||
+ /* latency requirement hints */
|
||||
+ __s32 sched_latency_nice;
|
||||
};
|
||||
|
||||
#endif /* _UAPI_LINUX_SCHED_TYPES_H */
|
||||
diff --git a/init/init_task.c b/init/init_task.c
|
||||
index ff6c4b9bfe6b1..511cbcf3510dc 100644
|
||||
--- a/init/init_task.c
|
||||
+++ b/init/init_task.c
|
||||
@@ -78,6 +78,7 @@ struct task_struct init_task
|
||||
.prio = MAX_PRIO - 20,
|
||||
.static_prio = MAX_PRIO - 20,
|
||||
.normal_prio = MAX_PRIO - 20,
|
||||
+ .latency_prio = DEFAULT_PRIO,
|
||||
.policy = SCHED_NORMAL,
|
||||
.cpus_ptr = &init_task.cpus_mask,
|
||||
.user_cpus_ptr = NULL,
|
||||
@@ -89,7 +90,7 @@ struct task_struct init_task
|
||||
.fn = do_no_restart_syscall,
|
||||
},
|
||||
.se = {
|
||||
- .group_node = LIST_HEAD_INIT(init_task.se.group_node),
|
||||
+ .group_node = LIST_HEAD_INIT(init_task.se.group_node),
|
||||
},
|
||||
.rt = {
|
||||
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index a5d3422f7d0de..b3533d0d4a2ca 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -4757,6 +4757,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
p->prio = p->normal_prio = p->static_prio;
|
||||
set_load_weight(p, false);
|
||||
|
||||
+ p->latency_prio = NICE_TO_PRIO(0);
|
||||
+
|
||||
/*
|
||||
* We don't need the reset flag anymore after the fork. It has
|
||||
* fulfilled its duty:
|
||||
@@ -7531,7 +7533,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
|
||||
#define SETPARAM_POLICY -1
|
||||
|
||||
static void __setscheduler_params(struct task_struct *p,
|
||||
- const struct sched_attr *attr)
|
||||
+ const struct sched_attr *attr)
|
||||
{
|
||||
int policy = attr->sched_policy;
|
||||
|
||||
@@ -7555,6 +7557,13 @@ static void __setscheduler_params(struct task_struct *p,
|
||||
set_load_weight(p, true);
|
||||
}
|
||||
|
||||
+static void __setscheduler_latency(struct task_struct *p,
|
||||
+ const struct sched_attr *attr)
|
||||
+{
|
||||
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
|
||||
+ p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* Check the target process has a UID that matches the current process's:
|
||||
*/
|
||||
@@ -7689,6 +7698,13 @@ recheck:
|
||||
return retval;
|
||||
}
|
||||
|
||||
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
|
||||
+ if (attr->sched_latency_nice > MAX_NICE)
|
||||
+ return -EINVAL;
|
||||
+ if (attr->sched_latency_nice < MIN_NICE)
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+
|
||||
/* Update task specific "requested" clamps */
|
||||
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
|
||||
retval = uclamp_validate(p, attr);
|
||||
@@ -7736,6 +7752,9 @@ recheck:
|
||||
goto change;
|
||||
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
|
||||
goto change;
|
||||
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
|
||||
+ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
|
||||
+ goto change;
|
||||
|
||||
p->sched_reset_on_fork = reset_on_fork;
|
||||
retval = 0;
|
||||
@@ -7824,6 +7843,7 @@ change:
|
||||
__setscheduler_params(p, attr);
|
||||
__setscheduler_prio(p, newprio);
|
||||
}
|
||||
+ __setscheduler_latency(p, attr);
|
||||
__setscheduler_uclamp(p, attr);
|
||||
|
||||
if (queued) {
|
||||
@@ -8035,6 +8055,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
|
||||
size < SCHED_ATTR_SIZE_VER1)
|
||||
return -EINVAL;
|
||||
|
||||
+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
|
||||
+ size < SCHED_ATTR_SIZE_VER2)
|
||||
+ return -EINVAL;
|
||||
/*
|
||||
* XXX: Do we want to be lenient like existing syscalls; or do we want
|
||||
* to be strict and return an error on out-of-bounds values?
|
||||
@@ -8272,6 +8295,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
|
||||
get_params(p, &kattr);
|
||||
kattr.sched_flags &= SCHED_FLAG_ALL;
|
||||
|
||||
+ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio);
|
||||
+
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
/*
|
||||
* This could race with another potential updater, but this is fine
|
||||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
|
||||
index 4c3d0d9f3db63..5c743bcb340d2 100644
|
||||
--- a/kernel/sched/debug.c
|
||||
+++ b/kernel/sched/debug.c
|
||||
@@ -1086,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
|
||||
#endif
|
||||
P(policy);
|
||||
P(prio);
|
||||
+ P(latency_prio);
|
||||
if (task_has_dl_policy(p)) {
|
||||
P(dl.runtime);
|
||||
P(dl.deadline);
|
||||
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
|
||||
index 3bac0a8ceab26..b2e932c25be62 100644
|
||||
--- a/tools/include/uapi/linux/sched.h
|
||||
+++ b/tools/include/uapi/linux/sched.h
|
||||
@@ -132,6 +132,7 @@ struct clone_args {
|
||||
#define SCHED_FLAG_KEEP_PARAMS 0x10
|
||||
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
|
||||
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
|
||||
+#define SCHED_FLAG_LATENCY_NICE 0x80
|
||||
|
||||
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
|
||||
SCHED_FLAG_KEEP_PARAMS)
|
||||
@@ -143,6 +144,7 @@ struct clone_args {
|
||||
SCHED_FLAG_RECLAIM | \
|
||||
SCHED_FLAG_DL_OVERRUN | \
|
||||
SCHED_FLAG_KEEP_ALL | \
|
||||
- SCHED_FLAG_UTIL_CLAMP)
|
||||
+ SCHED_FLAG_UTIL_CLAMP | \
|
||||
+ SCHED_FLAG_LATENCY_NICE)
|
||||
|
||||
#endif /* _UAPI_LINUX_SCHED_H */
|
||||
--
|
||||
cgit
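
A minimal user-space sketch of the new interface, not part of the patch
series: it requests latency-nice -10 for the calling task through the raw
sched_setattr(2) syscall (glibc has no wrapper). The local struct mirrors
the uapi layout up to SCHED_ATTR_SIZE_VER2; the value -10 and the explicit
size of 60 bytes follow the definitions added above, and a kernel without
these patches will reject the flag.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#define SCHED_FLAG_LATENCY_NICE 0x80	/* as defined by the patch above */

struct sched_attr_v2 {			/* struct sched_attr, VER2 layout (60 bytes) */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;
	uint32_t sched_util_max;
	int32_t  sched_latency_nice;	/* new field */
};

int main(void)
{
	struct sched_attr_v2 attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = 60;				/* SCHED_ATTR_SIZE_VER2 */
	attr.sched_policy = 0;			/* SCHED_OTHER; nice is also (re)set to 0 here */
	attr.sched_flags = SCHED_FLAG_LATENCY_NICE;
	attr.sched_latency_nice = -10;		/* lower value => smaller request => less latency */

	if (syscall(SYS_sched_setattr, 0 /* self */, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}
	return 0;
}
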
From 9f9a3323112d3aa5afa466b1e391e137f28dc79d Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Fri, 24 Feb 2023 10:34:51 +0100
Subject: sched/fair: Implement latency-nice

Implement latency-nice as a modulation of the EEVDF r_i parameter,
|
||||
specifically apply the inverse sched_prio_to_weight[] relation on
|
||||
base_slice.
|
||||
|
||||
Given a base slice of 3 [ms], this gives a range of:
|
||||
|
||||
latency-nice 19: 3*1024 / 15 ~= 204.8 [ms]
|
||||
latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]
|
||||
|
||||
(which might not make sense)
|
||||
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
|
||||
---
|
||||
kernel/sched/core.c | 14 ++++++++++----
|
||||
kernel/sched/fair.c | 22 +++++++++++++++-------
|
||||
kernel/sched/sched.h | 2 ++
|
||||
3 files changed, 27 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index b3533d0d4a2ca..263caac8f76b7 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
|
||||
}
|
||||
}
|
||||
|
||||
+static inline void set_latency_prio(struct task_struct *p, int prio)
|
||||
+{
|
||||
+ p->latency_prio = prio;
|
||||
+ set_latency_fair(&p->se, prio - MAX_RT_PRIO);
|
||||
+}
|
||||
+
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
/*
|
||||
* Serializes updates of utilization clamp values
|
||||
@@ -4502,9 +4508,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
p->se.nr_migrations = 0;
|
||||
p->se.vruntime = 0;
|
||||
p->se.vlag = 0;
|
||||
- p->se.slice = sysctl_sched_base_slice;
|
||||
INIT_LIST_HEAD(&p->se.group_node);
|
||||
|
||||
+ set_latency_prio(p, p->latency_prio);
|
||||
+
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
p->se.cfs_rq = NULL;
|
||||
#endif
|
||||
@@ -4756,8 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
|
||||
p->prio = p->normal_prio = p->static_prio;
|
||||
set_load_weight(p, false);
|
||||
-
|
||||
- p->latency_prio = NICE_TO_PRIO(0);
|
||||
+ set_latency_prio(p, NICE_TO_PRIO(0));
|
||||
|
||||
/*
|
||||
* We don't need the reset flag anymore after the fork. It has
|
||||
@@ -7561,7 +7567,7 @@ static void __setscheduler_latency(struct task_struct *p,
|
||||
const struct sched_attr *attr)
|
||||
{
|
||||
if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
|
||||
- p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
|
||||
+ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
|
||||
}
|
||||
|
||||
/*
|
||||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
||||
index 16949f7bbb172..c2019e7d46cf5 100644
|
||||
--- a/kernel/sched/fair.c
|
||||
+++ b/kernel/sched/fair.c
|
||||
@@ -952,6 +952,21 @@ int sched_update_scaling(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
+void set_latency_fair(struct sched_entity *se, int prio)
|
||||
+{
|
||||
+ u32 weight = sched_prio_to_weight[prio];
|
||||
+ u64 base = sysctl_sched_base_slice;
|
||||
+
|
||||
+ /*
|
||||
+ * For EEVDF the virtual time slope is determined by w_i (iow.
|
||||
+ * nice) while the request time r_i is determined by
|
||||
+ * latency-nice.
|
||||
+ *
|
||||
+ * Smaller request gets better latency.
|
||||
+ */
|
||||
+ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
|
||||
+}
|
||||
+
|
||||
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
|
||||
|
||||
/*
|
||||
@@ -963,13 +978,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
if ((s64)(se->vruntime - se->deadline) < 0)
|
||||
return;
|
||||
|
||||
- /*
|
||||
- * For EEVDF the virtual time slope is determined by w_i (iow.
|
||||
- * nice) while the request time r_i is determined by
|
||||
- * sysctl_sched_base_slice.
|
||||
- */
|
||||
- se->slice = sysctl_sched_base_slice;
|
||||
-
|
||||
/*
|
||||
* EEVDF: vd_i = ve_i + r_i / w_i
|
||||
*/
|
||||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
||||
index bc45beee335c5..8f8d903a01892 100644
|
||||
--- a/kernel/sched/sched.h
|
||||
+++ b/kernel/sched/sched.h
|
||||
@@ -2520,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
|
||||
extern unsigned int sysctl_numa_balancing_hot_threshold;
|
||||
#endif
|
||||
|
||||
+extern void set_latency_fair(struct sched_entity *se, int prio);
|
||||
+
|
||||
#ifdef CONFIG_SCHED_HRTICK
|
||||
|
||||
/*
|
||||
--
|
||||
cgit
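
To make the mapping above concrete, a small stand-alone computation (not
kernel code) of the request/slice that set_latency_fair() produces for a
3 ms base slice; the three weights are the sched_prio_to_weight[] entries
for nice -20, 0 and 19, and the printed values match the range quoted in
the changelog above.

#include <stdio.h>

int main(void)
{
	const double base_ms = 3.0;		/* assume sysctl_sched_base_slice = 3 ms */
	const struct { int nice; unsigned int weight; } pts[] = {
		{ -20, 88761 }, { 0, 1024 }, { 19, 15 },
	};

	for (unsigned int i = 0; i < sizeof(pts) / sizeof(pts[0]); i++) {
		/* se->slice = (base << SCHED_FIXEDPOINT_SHIFT) / weight, i.e. base * 1024 / weight */
		double slice_ms = base_ms * 1024.0 / pts[i].weight;

		printf("latency-nice %3d -> request/slice %.3f ms\n", pts[i].nice, slice_ms);
	}
	return 0;
}
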
From a317f35154852bc023a7ab2e3fa491e1897af72f Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 24 Feb 2023 10:34:52 +0100
Subject: sched/fair: Add sched group latency support

A task can set its latency priority with sched_setattr(), which is then
used to set the latency offset of its sched_entity, but sched group
entities still have the default latency offset value.
|
||||
|
||||
Add a latency.nice field in cpu cgroup controller to set the latency
|
||||
priority of the group similarly to sched_setattr(). The latency priority
|
||||
is then used to set the offset of the sched_entities of the group.
|
||||
|
||||
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
|
||||
Link: https://lkml.kernel.org/r/20230224093454.956298-7-vincent.guittot@linaro.org
|
||||
---
|
||||
Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++
|
||||
kernel/sched/core.c | 30 ++++++++++++++++++++++++++++++
|
||||
kernel/sched/fair.c | 27 +++++++++++++++++++++++++++
|
||||
kernel/sched/sched.h | 4 ++++
|
||||
4 files changed, 71 insertions(+)
|
||||
|
||||
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
|
||||
index 4ef8901911961..3a8d3e1e55910 100644
|
||||
--- a/Documentation/admin-guide/cgroup-v2.rst
|
||||
+++ b/Documentation/admin-guide/cgroup-v2.rst
|
||||
@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
|
||||
values similar to the sched_setattr(2). This maximum utilization
|
||||
value is used to clamp the task specific maximum utilization clamp.
|
||||
|
||||
+ cpu.latency.nice
|
||||
+ A read-write single value file which exists on non-root
|
||||
+ cgroups. The default is "0".
|
||||
+
|
||||
+ The nice value is in the range [-20, 19].
|
||||
+
|
||||
+ This interface file allows reading and setting latency using the
|
||||
+ same values used by sched_setattr(2). The latency_nice of a group is
|
||||
+ used to limit the impact of the latency_nice of a task outside the
|
||||
+ group.
|
||||
|
||||
|
||||
Memory
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index 263caac8f76b7..8a541fe2d4626 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -11247,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
|
||||
{
|
||||
return sched_group_set_idle(css_tg(css), idle);
|
||||
}
|
||||
+
|
||||
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cft)
|
||||
+{
|
||||
+ return PRIO_TO_NICE(css_tg(css)->latency_prio);
|
||||
+}
|
||||
+
|
||||
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cft, s64 nice)
|
||||
+{
|
||||
+ int prio;
|
||||
+
|
||||
+ if (nice < MIN_NICE || nice > MAX_NICE)
|
||||
+ return -ERANGE;
|
||||
+
|
||||
+ prio = NICE_TO_PRIO(nice);
|
||||
+
|
||||
+ return sched_group_set_latency(css_tg(css), prio);
|
||||
+}
|
||||
#endif
|
||||
|
||||
static struct cftype cpu_legacy_files[] = {
|
||||
@@ -11261,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = {
|
||||
.read_s64 = cpu_idle_read_s64,
|
||||
.write_s64 = cpu_idle_write_s64,
|
||||
},
|
||||
+ {
|
||||
+ .name = "latency.nice",
|
||||
+ .read_s64 = cpu_latency_nice_read_s64,
|
||||
+ .write_s64 = cpu_latency_nice_write_s64,
|
||||
+ },
|
||||
#endif
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
{
|
||||
@@ -11500,6 +11524,12 @@ static struct cftype cpu_files[] = {
|
||||
.read_s64 = cpu_idle_read_s64,
|
||||
.write_s64 = cpu_idle_write_s64,
|
||||
},
|
||||
+ {
|
||||
+ .name = "latency.nice",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .read_s64 = cpu_latency_nice_read_s64,
|
||||
+ .write_s64 = cpu_latency_nice_write_s64,
|
||||
+ },
|
||||
#endif
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
{
|
||||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
||||
index c2019e7d46cf5..8a4799c600309 100644
|
||||
--- a/kernel/sched/fair.c
|
||||
+++ b/kernel/sched/fair.c
|
||||
@@ -12545,6 +12545,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
|
||||
goto err;
|
||||
|
||||
tg->shares = NICE_0_LOAD;
|
||||
+ tg->latency_prio = DEFAULT_PRIO;
|
||||
|
||||
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
|
||||
|
||||
@@ -12643,6 +12644,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
|
||||
}
|
||||
|
||||
se->my_q = cfs_rq;
|
||||
+
|
||||
+ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO);
|
||||
+
|
||||
/* guarantee group entities always have weight */
|
||||
update_load_set(&se->load, NICE_0_LOAD);
|
||||
se->parent = parent;
|
||||
@@ -12773,6 +12777,29 @@ next_cpu:
|
||||
return 0;
|
||||
}
|
||||
|
||||
+int sched_group_set_latency(struct task_group *tg, int prio)
|
||||
+{
|
||||
+ int i;
|
||||
+
|
||||
+ if (tg == &root_task_group)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ mutex_lock(&shares_mutex);
|
||||
+
|
||||
+ if (tg->latency_prio == prio) {
|
||||
+ mutex_unlock(&shares_mutex);
|
||||
+ return 0;
|
||||
+ }
|
||||
+
|
||||
+ tg->latency_prio = prio;
|
||||
+
|
||||
+ for_each_possible_cpu(i)
|
||||
+ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO);
|
||||
+
|
||||
+ mutex_unlock(&shares_mutex);
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
#else /* CONFIG_FAIR_GROUP_SCHED */
|
||||
|
||||
void free_fair_sched_group(struct task_group *tg) { }
|
||||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
||||
index 8f8d903a01892..4236c4c893aa7 100644
|
||||
--- a/kernel/sched/sched.h
|
||||
+++ b/kernel/sched/sched.h
|
||||
@@ -372,6 +372,8 @@ struct task_group {
|
||||
|
||||
/* A positive value indicates that this is a SCHED_IDLE group. */
|
||||
int idle;
|
||||
+ /* latency priority of the group. */
|
||||
+ int latency_prio;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
@@ -482,6 +484,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
|
||||
|
||||
extern int sched_group_set_idle(struct task_group *tg, long idle);
|
||||
|
||||
+extern int sched_group_set_latency(struct task_group *tg, int prio);
|
||||
+
|
||||
#ifdef CONFIG_SMP
|
||||
extern void set_task_rq_fair(struct sched_entity *se,
|
||||
struct cfs_rq *prev, struct cfs_rq *next);
|
||||
--
|
||||
cgit
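
A minimal sketch of driving the new cgroup knob from user space, not part
of the patch; the cgroup path and the value -10 are assumptions, and the
group must already exist under a cgroup-v2 hierarchy with the cpu
controller enabled (a plain shell redirect into the file works just as
well).

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical group; adjust the path to your hierarchy */
	const char *path = "/sys/fs/cgroup/demo/cpu.latency.nice";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* same value range as the per-task attribute: [-20, 19] */
	if (dprintf(fd, "%d\n", -10) < 0)
		perror("write");
	close(fd);
	return 0;
}
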
From b412068f928064d23f67709f46d36d7659079e54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 May 2023 13:46:30 +0200
Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice

As an alternative to the latency-nice interface; allow applications to
|
||||
directly set the request/slice using sched_attr::sched_runtime.
|
||||
|
||||
The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
|
||||
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
|
||||
|
||||
Applications should strive to use their periodic runtime at a high
|
||||
confidence interval (95%+) as the target slice. Using a smaller slice
|
||||
will introduce undue preemptions, while using a larger value will
|
||||
increase latency.
|
||||
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
---
|
||||
kernel/sched/core.c | 24 ++++++++++++++++++------
|
||||
1 file changed, 18 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index 8a541fe2d4626..5b71c398f6cf6 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -7548,10 +7548,18 @@ static void __setscheduler_params(struct task_struct *p,
|
||||
|
||||
p->policy = policy;
|
||||
|
||||
- if (dl_policy(policy))
|
||||
+ if (dl_policy(policy)) {
|
||||
__setparam_dl(p, attr);
|
||||
- else if (fair_policy(policy))
|
||||
+ } else if (fair_policy(policy)) {
|
||||
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
|
||||
+ if (attr->sched_runtime) {
|
||||
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
|
||||
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
|
||||
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
|
||||
+ } else {
|
||||
+ p->se.slice = sysctl_sched_base_slice;
|
||||
+ }
|
||||
+ }
|
||||
|
||||
/*
|
||||
* __sched_setscheduler() ensures attr->sched_priority == 0 when
|
||||
@@ -7750,7 +7758,9 @@ recheck:
|
||||
* but store a possible modification of reset_on_fork.
|
||||
*/
|
||||
if (unlikely(policy == p->policy)) {
|
||||
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
|
||||
+ if (fair_policy(policy) &&
|
||||
+ (attr->sched_nice != task_nice(p) ||
|
||||
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
|
||||
goto change;
|
||||
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
|
||||
goto change;
|
||||
@@ -8079,12 +8089,14 @@ err_size:
|
||||
|
||||
static void get_params(struct task_struct *p, struct sched_attr *attr)
|
||||
{
|
||||
- if (task_has_dl_policy(p))
|
||||
+ if (task_has_dl_policy(p)) {
|
||||
__getparam_dl(p, attr);
|
||||
- else if (task_has_rt_policy(p))
|
||||
+ } else if (task_has_rt_policy(p)) {
|
||||
attr->sched_priority = p->rt_priority;
|
||||
- else
|
||||
+ } else {
|
||||
attr->sched_nice = task_nice(p);
|
||||
+ attr->sched_runtime = p->se.slice;
|
||||
+ }
|
||||
}
|
||||
|
||||
/**
|
||||
--
|
||||
cgit
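
As a rough user-space illustration, again not part of the series, a
SCHED_OTHER task could request a 2 ms slice through
sched_attr::sched_runtime as sketched below; only the VER0 fields
(48 bytes) are needed, and the kernel clamps the value to the
0.1 ms .. 100 ms window described above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr_v0 {			/* VER0 layout of struct sched_attr (48 bytes) */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr_v0 attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = 48;				/* SCHED_ATTR_SIZE_VER0 */
	attr.sched_policy = 0;			/* SCHED_OTHER */
	attr.sched_runtime = 2 * 1000 * 1000;	/* request a 2 ms slice, in nanoseconds */

	if (syscall(SYS_sched_setattr, 0 /* self */, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}
	return 0;
}
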
From 2f88c8e802c8b128a155976631f4eb2ce4f3c805 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Date: Thu, 24 Aug 2023 13:33:42 +0530
Subject: sched/eevdf/doc: Modify the documented knob to base_slice_ns as well

After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns'
|
||||
sysctl to 'base_slice_ns':
|
||||
|
||||
e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
|
||||
|
||||
... but we forgot to rename it in the documentation. Do that now.
|
||||
|
||||
Fixes: e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
|
||||
Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
|
||||
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Link: https://lore.kernel.org/r/20230824080342.543396-1-sshegde@linux.vnet.ibm.com
|
||||
---
|
||||
Documentation/scheduler/sched-design-CFS.rst | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
|
||||
index 03db555045151..f68919800f050 100644
|
||||
--- a/Documentation/scheduler/sched-design-CFS.rst
|
||||
+++ b/Documentation/scheduler/sched-design-CFS.rst
|
||||
@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
|
||||
way the previous scheduler had, and has no heuristics whatsoever. There is
|
||||
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
|
||||
|
||||
- /sys/kernel/debug/sched/min_granularity_ns
|
||||
+ /sys/kernel/debug/sched/base_slice_ns
|
||||
|
||||
which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
|
||||
"server" (i.e., good batching) workloads. It defaults to a setting suitable
|
||||
--
|
||||
cgit
|
||||
|
||||
From 63304558ba5dcaaff9e052ee43cfdcc7f9c29e85 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Aug 2023 15:40:59 +0200
Subject: sched/eevdf: Curb wakeup-preemption

Mike and others noticed that EEVDF does like to over-schedule quite a
|
||||
bit -- which does hurt performance of a number of benchmarks /
|
||||
workloads.
|
||||
|
||||
In particular, what seems to cause over-scheduling is that when lag is
|
||||
of the same order (or larger) than the request / slice then placement
|
||||
will not only cause the task to be placed left of current, but also
|
||||
with a smaller deadline than current, which causes immediate
|
||||
preemption.
|
||||
|
||||
[ notably, lag bounds are relative to HZ ]
|
||||
|
||||
Mike suggested we stick to picking 'current' for as long as it's
|
||||
eligible to run, giving it uninterrupted runtime until it reaches
|
||||
parity with the pack.
|
||||
|
||||
Augment Mike's suggestion by only allowing it to exhaust its initial
request.
|
||||
|
||||
One random data point:
|
||||
|
||||
echo NO_RUN_TO_PARITY > /debug/sched/features
|
||||
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
|
||||
|
||||
3,723,554 context-switches ( +- 0.56% )
|
||||
9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% )
|
||||
|
||||
echo RUN_TO_PARITY > /debug/sched/features
|
||||
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
|
||||
|
||||
2,556,535 context-switches ( +- 0.51% )
|
||||
9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% )
|
||||
|
||||
Suggested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net
|
||||
---
|
||||
kernel/sched/fair.c | 12 ++++++++++++
|
||||
kernel/sched/features.h | 1 +
|
||||
2 files changed, 13 insertions(+)
|
||||
|
||||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
||||
index f496cef90ce77..0b7445cd5af98 100644
|
||||
--- a/kernel/sched/fair.c
|
||||
+++ b/kernel/sched/fair.c
|
||||
@@ -873,6 +873,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
|
||||
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
|
||||
curr = NULL;
|
||||
|
||||
+ /*
|
||||
+ * Once selected, run a task until it either becomes non-eligible or
|
||||
+ * until it gets a new slice. See the HACK in set_next_entity().
|
||||
+ */
|
||||
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
|
||||
+ return curr;
|
||||
+
|
||||
while (node) {
|
||||
struct sched_entity *se = __node_2_se(node);
|
||||
|
||||
@@ -5167,6 +5174,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
update_stats_wait_end_fair(cfs_rq, se);
|
||||
__dequeue_entity(cfs_rq, se);
|
||||
update_load_avg(cfs_rq, se, UPDATE_TG);
|
||||
+ /*
|
||||
+ * HACK, stash a copy of deadline at the point of pick in vlag,
|
||||
+ * which isn't used until dequeue.
|
||||
+ */
|
||||
+ se->vlag = se->deadline;
|
||||
}
|
||||
|
||||
update_stats_curr_start(cfs_rq, se);
|
||||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
||||
index 61bcbf5e46a45..f770168230ae4 100644
|
||||
--- a/kernel/sched/features.h
|
||||
+++ b/kernel/sched/features.h
|
||||
@@ -6,6 +6,7 @@
|
||||
*/
|
||||
SCHED_FEAT(PLACE_LAG, true)
|
||||
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
|
||||
+SCHED_FEAT(RUN_TO_PARITY, true)
|
||||
|
||||
/*
|
||||
* Prefer to schedule the task we woke last (assuming it failed
|
||||
--
|
||||
cgit
|
||||
|
||||
|
@@ -2756,3 +2756,885 @@ index 7ff9965570e69..db5853761b1f3 100644
|
||||
--
|
||||
cgit
|
||||
|
||||
From 246c6d7ab4d042b185d7df71f437137d43cbb83a Mon Sep 17 00:00:00 2001
|
||||
From: Peter Zijlstra <peterz@infradead.org>
|
||||
Date: Sat, 25 Mar 2023 00:14:04 +0100
|
||||
Subject: sched/eevdf: Better handle mixed slice length
|
||||
|
||||
In the case where (due to latency-nice) there are different request
|
||||
sizes in the tree, the smaller requests tend to be dominated by the
|
||||
larger. Also note how the EEVDF lag limits are based on r_max.
|
||||
|
||||
Therefore; add a heuristic that for the mixed request size case, moves
|
||||
smaller requests to placement strategy #2 which ensures they're
|
||||
immidiately eligible and and due to their smaller (virtual) deadline
|
||||
will cause preemption.
|
||||
|
||||
NOTE: this relies on update_entity_lag() to impose lag limits above
|
||||
a single slice.
|
||||
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
---
|
||||
kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++++
|
||||
kernel/sched/features.h | 1 +
|
||||
kernel/sched/sched.h | 1 +
|
||||
3 files changed, 41 insertions(+)
|
||||
|
||||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
||||
index 5c8c9f7d8496a..16949f7bbb172 100644
|
||||
--- a/kernel/sched/fair.c
|
||||
+++ b/kernel/sched/fair.c
|
||||
@@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
s64 key = entity_key(cfs_rq, se);
|
||||
|
||||
cfs_rq->avg_vruntime += key * weight;
|
||||
+ cfs_rq->avg_slice += se->slice * weight;
|
||||
cfs_rq->avg_load += weight;
|
||||
}
|
||||
|
||||
@@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
s64 key = entity_key(cfs_rq, se);
|
||||
|
||||
cfs_rq->avg_vruntime -= key * weight;
|
||||
+ cfs_rq->avg_slice -= se->slice * weight;
|
||||
cfs_rq->avg_load -= weight;
|
||||
}
|
||||
|
||||
@@ -4908,6 +4910,30 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
+static inline bool
|
||||
+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags)
|
||||
+{
|
||||
+ u64 now, vdelta;
|
||||
+ s64 delta;
|
||||
+
|
||||
+ if (!(flags & ENQUEUE_WAKEUP))
|
||||
+ return false;
|
||||
+
|
||||
+ if (flags & ENQUEUE_MIGRATED)
|
||||
+ return true;
|
||||
+
|
||||
+ now = rq_clock_task(rq_of(cfs_rq));
|
||||
+ delta = now - se->exec_start;
|
||||
+ if (delta < 0)
|
||||
+ return false;
|
||||
+
|
||||
+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
|
||||
+ if (vdelta < vslice)
|
||||
+ return false;
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
static void
|
||||
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
{
|
||||
@@ -4929,6 +4955,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
|
||||
lag = se->vlag;
|
||||
|
||||
+ /*
|
||||
+ * For latency sensitive tasks; those that have a shorter than
|
||||
+ * average slice and do not fully consume the slice, transition
|
||||
+ * to EEVDF placement strategy #2.
|
||||
+ */
|
||||
+ if (sched_feat(PLACE_FUDGE) &&
|
||||
+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
|
||||
+ entity_has_slept(cfs_rq, se, vslice, flags)) {
|
||||
+ lag += vslice;
|
||||
+ if (lag > 0)
|
||||
+ lag = 0;
|
||||
+ }
|
||||
+
|
||||
/*
|
||||
* If we want to place a task and preserve lag, we have to
|
||||
* consider the effect of the new entity on the weighted
|
||||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
|
||||
index 54334ca5c5c61..7d65b40299d91 100644
|
||||
--- a/kernel/sched/features.h
|
||||
+++ b/kernel/sched/features.h
|
||||
@@ -5,6 +5,7 @@
|
||||
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
|
||||
*/
|
||||
SCHED_FEAT(PLACE_LAG, true)
|
||||
+SCHED_FEAT(PLACE_FUDGE, true)
|
||||
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
|
||||
|
||||
/*
|
||||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
||||
index db5853761b1f3..bc45beee335c5 100644
|
||||
--- a/kernel/sched/sched.h
|
||||
+++ b/kernel/sched/sched.h
|
||||
@@ -549,6 +549,7 @@ struct cfs_rq {
|
||||
unsigned int idle_h_nr_running; /* SCHED_IDLE */
|
||||
|
||||
s64 avg_vruntime;
|
||||
+ u64 avg_slice;
|
||||
u64 avg_load;
|
||||
|
||||
u64 exec_clock;
|
||||
--
|
||||
cgit
|
||||
|
||||
From 36b9081885fee5764b53970dd2d6afe8c2f13b7f Mon Sep 17 00:00:00 2001
|
||||
From: Parth Shah <parth@linux.ibm.com>
|
||||
Date: Sat, 11 Mar 2023 12:20:21 +0100
|
||||
Subject: sched: Introduce latency-nice as a per-task attribute
|
||||
|
||||
Latency-nice indicates the latency requirements of a task with respect
|
||||
to the other tasks in the system. The value of the attribute can be within
|
||||
the range of [-20, 19] both inclusive to be in-line with the values just
|
||||
like task nice values.
|
||||
|
||||
Just like task nice, -20 is the 'highest' priority and conveys this
|
||||
task should get minimal latency, conversely 19 is the lowest priority
|
||||
and conveys this task will get the least consideration and will thus
|
||||
receive maximal latency.
|
||||
|
||||
[peterz: rebase, squash]
|
||||
Signed-off-by: Parth Shah <parth@linux.ibm.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
---
|
||||
include/linux/sched.h | 1 +
|
||||
include/uapi/linux/sched.h | 4 +++-
|
||||
include/uapi/linux/sched/types.h | 19 +++++++++++++++++++
|
||||
init/init_task.c | 3 ++-
|
||||
kernel/sched/core.c | 27 ++++++++++++++++++++++++++-
|
||||
kernel/sched/debug.c | 1 +
|
||||
tools/include/uapi/linux/sched.h | 4 +++-
|
||||
7 files changed, 55 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/include/linux/sched.h b/include/linux/sched.h
|
||||
index 177b3f3676ef8..80bb40a63e9aa 100644
|
||||
--- a/include/linux/sched.h
|
||||
+++ b/include/linux/sched.h
|
||||
@@ -790,6 +790,7 @@ struct task_struct {
|
||||
int static_prio;
|
||||
int normal_prio;
|
||||
unsigned int rt_priority;
|
||||
+ int latency_prio;
|
||||
|
||||
struct sched_entity se;
|
||||
struct sched_rt_entity rt;
|
||||
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
|
||||
index 3bac0a8ceab26..b2e932c25be62 100644
|
||||
--- a/include/uapi/linux/sched.h
|
||||
+++ b/include/uapi/linux/sched.h
|
||||
@@ -132,6 +132,7 @@ struct clone_args {
|
||||
#define SCHED_FLAG_KEEP_PARAMS 0x10
|
||||
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
|
||||
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
|
||||
+#define SCHED_FLAG_LATENCY_NICE 0x80
|
||||
|
||||
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
|
||||
SCHED_FLAG_KEEP_PARAMS)
|
||||
@@ -143,6 +144,7 @@ struct clone_args {
|
||||
SCHED_FLAG_RECLAIM | \
|
||||
SCHED_FLAG_DL_OVERRUN | \
|
||||
SCHED_FLAG_KEEP_ALL | \
|
||||
- SCHED_FLAG_UTIL_CLAMP)
|
||||
+ SCHED_FLAG_UTIL_CLAMP | \
|
||||
+ SCHED_FLAG_LATENCY_NICE)
|
||||
|
||||
#endif /* _UAPI_LINUX_SCHED_H */
|
||||
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
|
||||
index f2c4589d4dbfe..db1e8199e8c80 100644
|
||||
--- a/include/uapi/linux/sched/types.h
|
||||
+++ b/include/uapi/linux/sched/types.h
|
||||
@@ -10,6 +10,7 @@ struct sched_param {
|
||||
|
||||
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
|
||||
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
|
||||
+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */
|
||||
|
||||
/*
|
||||
* Extended scheduling parameters data structure.
|
||||
@@ -98,6 +99,22 @@ struct sched_param {
|
||||
* scheduled on a CPU with no more capacity than the specified value.
|
||||
*
|
||||
* A task utilization boundary can be reset by setting the attribute to -1.
|
||||
+ *
|
||||
+ * Latency Tolerance Attributes
|
||||
+ * ===========================
|
||||
+ *
|
||||
+ * A subset of sched_attr attributes allows to specify the relative latency
|
||||
+ * requirements of a task with respect to the other tasks running/queued in the
|
||||
+ * system.
|
||||
+ *
|
||||
+ * @ sched_latency_nice task's latency_nice value
|
||||
+ *
|
||||
+ * The latency_nice of a task can have any value in a range of
|
||||
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
|
||||
+ *
|
||||
+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
|
||||
+ * taken for a task requiring a lower latency as opposed to the task with
|
||||
+ * higher latency_nice.
|
||||
*/
|
||||
struct sched_attr {
|
||||
__u32 size;
|
||||
@@ -120,6 +137,8 @@ struct sched_attr {
|
||||
__u32 sched_util_min;
|
||||
__u32 sched_util_max;
|
||||
|
||||
+ /* latency requirement hints */
|
||||
+ __s32 sched_latency_nice;
|
||||
};
|
||||
|
||||
#endif /* _UAPI_LINUX_SCHED_TYPES_H */
|
||||
diff --git a/init/init_task.c b/init/init_task.c
|
||||
index ff6c4b9bfe6b1..511cbcf3510dc 100644
|
||||
--- a/init/init_task.c
|
||||
+++ b/init/init_task.c
|
||||
@@ -78,6 +78,7 @@ struct task_struct init_task
|
||||
.prio = MAX_PRIO - 20,
|
||||
.static_prio = MAX_PRIO - 20,
|
||||
.normal_prio = MAX_PRIO - 20,
|
||||
+ .latency_prio = DEFAULT_PRIO,
|
||||
.policy = SCHED_NORMAL,
|
||||
.cpus_ptr = &init_task.cpus_mask,
|
||||
.user_cpus_ptr = NULL,
|
||||
@@ -89,7 +90,7 @@ struct task_struct init_task
|
||||
.fn = do_no_restart_syscall,
|
||||
},
|
||||
.se = {
|
||||
- .group_node = LIST_HEAD_INIT(init_task.se.group_node),
|
||||
+ .group_node = LIST_HEAD_INIT(init_task.se.group_node),
|
||||
},
|
||||
.rt = {
|
||||
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index a5d3422f7d0de..b3533d0d4a2ca 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -4757,6 +4757,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
p->prio = p->normal_prio = p->static_prio;
|
||||
set_load_weight(p, false);
|
||||
|
||||
+ p->latency_prio = NICE_TO_PRIO(0);
|
||||
+
|
||||
/*
|
||||
* We don't need the reset flag anymore after the fork. It has
|
||||
* fulfilled its duty:
|
||||
@@ -7531,7 +7533,7 @@ static struct task_struct *find_process_by_pid(pid_t pid)
|
||||
#define SETPARAM_POLICY -1
|
||||
|
||||
static void __setscheduler_params(struct task_struct *p,
|
||||
- const struct sched_attr *attr)
|
||||
+ const struct sched_attr *attr)
|
||||
{
|
||||
int policy = attr->sched_policy;
|
||||
|
||||
@@ -7555,6 +7557,13 @@ static void __setscheduler_params(struct task_struct *p,
|
||||
set_load_weight(p, true);
|
||||
}
|
||||
|
||||
+static void __setscheduler_latency(struct task_struct *p,
|
||||
+ const struct sched_attr *attr)
|
||||
+{
|
||||
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
|
||||
+ p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* Check the target process has a UID that matches the current process's:
|
||||
*/
|
||||
@@ -7689,6 +7698,13 @@ recheck:
|
||||
return retval;
|
||||
}
|
||||
|
||||
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
|
||||
+ if (attr->sched_latency_nice > MAX_NICE)
|
||||
+ return -EINVAL;
|
||||
+ if (attr->sched_latency_nice < MIN_NICE)
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+
|
||||
/* Update task specific "requested" clamps */
|
||||
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
|
||||
retval = uclamp_validate(p, attr);
|
||||
@@ -7736,6 +7752,9 @@ recheck:
|
||||
goto change;
|
||||
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
|
||||
goto change;
|
||||
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
|
||||
+ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
|
||||
+ goto change;
|
||||
|
||||
p->sched_reset_on_fork = reset_on_fork;
|
||||
retval = 0;
|
||||
@@ -7824,6 +7843,7 @@ change:
|
||||
__setscheduler_params(p, attr);
|
||||
__setscheduler_prio(p, newprio);
|
||||
}
|
||||
+ __setscheduler_latency(p, attr);
|
||||
__setscheduler_uclamp(p, attr);
|
||||
|
||||
if (queued) {
|
||||
@@ -8035,6 +8055,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
|
||||
size < SCHED_ATTR_SIZE_VER1)
|
||||
return -EINVAL;
|
||||
|
||||
+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
|
||||
+ size < SCHED_ATTR_SIZE_VER2)
|
||||
+ return -EINVAL;
|
||||
/*
|
||||
* XXX: Do we want to be lenient like existing syscalls; or do we want
|
||||
* to be strict and return an error on out-of-bounds values?
|
||||
@@ -8272,6 +8295,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
|
||||
get_params(p, &kattr);
|
||||
kattr.sched_flags &= SCHED_FLAG_ALL;
|
||||
|
||||
+ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio);
|
||||
+
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
/*
|
||||
* This could race with another potential updater, but this is fine
|
||||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
|
||||
index 4c3d0d9f3db63..5c743bcb340d2 100644
|
||||
--- a/kernel/sched/debug.c
|
||||
+++ b/kernel/sched/debug.c
|
||||
@@ -1086,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
|
||||
#endif
|
||||
P(policy);
|
||||
P(prio);
|
||||
+ P(latency_prio);
|
||||
if (task_has_dl_policy(p)) {
|
||||
P(dl.runtime);
|
||||
P(dl.deadline);
|
||||
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
|
||||
index 3bac0a8ceab26..b2e932c25be62 100644
|
||||
--- a/tools/include/uapi/linux/sched.h
|
||||
+++ b/tools/include/uapi/linux/sched.h
|
||||
@@ -132,6 +132,7 @@ struct clone_args {
|
||||
#define SCHED_FLAG_KEEP_PARAMS 0x10
|
||||
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
|
||||
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
|
||||
+#define SCHED_FLAG_LATENCY_NICE 0x80
|
||||
|
||||
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
|
||||
SCHED_FLAG_KEEP_PARAMS)
|
||||
@@ -143,6 +144,7 @@ struct clone_args {
|
||||
SCHED_FLAG_RECLAIM | \
|
||||
SCHED_FLAG_DL_OVERRUN | \
|
||||
SCHED_FLAG_KEEP_ALL | \
|
||||
- SCHED_FLAG_UTIL_CLAMP)
|
||||
+ SCHED_FLAG_UTIL_CLAMP | \
|
||||
+ SCHED_FLAG_LATENCY_NICE)
|
||||
|
||||
#endif /* _UAPI_LINUX_SCHED_H */
|
||||
--
|
||||
cgit
|
||||
|
||||
From 9f9a3323112d3aa5afa466b1e391e137f28dc79d Mon Sep 17 00:00:00 2001
|
||||
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
|
||||
Date: Fri, 24 Feb 2023 10:34:51 +0100
|
||||
Subject: sched/fair: Implement latency-nice
|
||||
|
||||
Implement latency-nice as a modulation of the EEVDF r_i parameter,
|
||||
specifically apply the inverse sched_prio_to_weight[] relation on
|
||||
base_slice.
|
||||
|
||||
Given a base slice of 3 [ms], this gives a range of:
|
||||
|
||||
latency-nice 19: 3*1024 / 15 ~= 204.8 [ms]
|
||||
latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]
|
||||
|
||||
(which might not make sense)
|
||||
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
|
||||
---
|
||||
kernel/sched/core.c | 14 ++++++++++----
|
||||
kernel/sched/fair.c | 22 +++++++++++++++-------
|
||||
kernel/sched/sched.h | 2 ++
|
||||
3 files changed, 27 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index b3533d0d4a2ca..263caac8f76b7 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load)
|
||||
}
|
||||
}
|
||||
|
||||
+static inline void set_latency_prio(struct task_struct *p, int prio)
|
||||
+{
|
||||
+ p->latency_prio = prio;
|
||||
+ set_latency_fair(&p->se, prio - MAX_RT_PRIO);
|
||||
+}
|
||||
+
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
/*
|
||||
* Serializes updates of utilization clamp values
|
||||
@@ -4502,9 +4508,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
p->se.nr_migrations = 0;
|
||||
p->se.vruntime = 0;
|
||||
p->se.vlag = 0;
|
||||
- p->se.slice = sysctl_sched_base_slice;
|
||||
INIT_LIST_HEAD(&p->se.group_node);
|
||||
|
||||
+ set_latency_prio(p, p->latency_prio);
|
||||
+
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
p->se.cfs_rq = NULL;
|
||||
#endif
|
||||
@@ -4756,8 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
|
||||
p->prio = p->normal_prio = p->static_prio;
|
||||
set_load_weight(p, false);
|
||||
-
|
||||
- p->latency_prio = NICE_TO_PRIO(0);
|
||||
+ set_latency_prio(p, NICE_TO_PRIO(0));
|
||||
|
||||
/*
|
||||
* We don't need the reset flag anymore after the fork. It has
|
||||
@@ -7561,7 +7567,7 @@ static void __setscheduler_latency(struct task_struct *p,
|
||||
const struct sched_attr *attr)
|
||||
{
|
||||
if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
|
||||
- p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
|
||||
+ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
|
||||
}
|
||||
|
||||
/*
|
||||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
|
||||
index 16949f7bbb172..c2019e7d46cf5 100644
|
||||
--- a/kernel/sched/fair.c
|
||||
+++ b/kernel/sched/fair.c
|
||||
@@ -952,6 +952,21 @@ int sched_update_scaling(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
+void set_latency_fair(struct sched_entity *se, int prio)
|
||||
+{
|
||||
+ u32 weight = sched_prio_to_weight[prio];
|
||||
+ u64 base = sysctl_sched_base_slice;
|
||||
+
|
||||
+ /*
|
||||
+ * For EEVDF the virtual time slope is determined by w_i (iow.
|
||||
+ * nice) while the request time r_i is determined by
|
||||
+ * latency-nice.
|
||||
+ *
|
||||
+ * Smaller request gets better latency.
|
||||
+ */
|
||||
+ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
|
||||
+}
|
||||
+
|
||||
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
|
||||
|
||||
/*
|
||||
@@ -963,13 +978,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
if ((s64)(se->vruntime - se->deadline) < 0)
|
||||
return;
|
||||
|
||||
- /*
|
||||
- * For EEVDF the virtual time slope is determined by w_i (iow.
|
||||
- * nice) while the request time r_i is determined by
|
||||
- * sysctl_sched_base_slice.
|
||||
- */
|
||||
- se->slice = sysctl_sched_base_slice;
|
||||
-
|
||||
/*
|
||||
* EEVDF: vd_i = ve_i + r_i / w_i
|
||||
*/
|
||||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
||||
index bc45beee335c5..8f8d903a01892 100644
|
||||
--- a/kernel/sched/sched.h
|
||||
+++ b/kernel/sched/sched.h
|
||||
@@ -2520,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
|
||||
extern unsigned int sysctl_numa_balancing_hot_threshold;
|
||||
#endif
|
||||
|
||||
+extern void set_latency_fair(struct sched_entity *se, int prio);
|
||||
+
|
||||
#ifdef CONFIG_SCHED_HRTICK
|
||||
|
||||
/*
|
||||
--
|
||||
cgit
|
||||
|
||||
From a317f35154852bc023a7ab2e3fa491e1897af72f Mon Sep 17 00:00:00 2001
|
||||
From: Vincent Guittot <vincent.guittot@linaro.org>
|
||||
Date: Fri, 24 Feb 2023 10:34:52 +0100
|
||||
Subject: sched/fair: Add sched group latency support
|
||||
|
||||
Task can set its latency priority with sched_setattr(), which is then used
|
||||
to set the latency offset of its sched_enity, but sched group entities
|
||||
still have the default latency offset value.
|
||||
|
||||
Add a latency.nice field in cpu cgroup controller to set the latency
|
||||
priority of the group similarly to sched_setattr(). The latency priority
|
||||
is then used to set the offset of the sched_entities of the group.
|
||||
|
||||
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
|
||||
Link: https://lkml.kernel.org/r/20230224093454.956298-7-vincent.guittot@linaro.org
---
Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++
kernel/sched/core.c | 30 ++++++++++++++++++++++++++++++
kernel/sched/fair.c | 27 +++++++++++++++++++++++++++
kernel/sched/sched.h | 4 ++++
4 files changed, 71 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 4ef8901911961..3a8d3e1e55910 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
values similar to the sched_setattr(2). This maximum utilization
value is used to clamp the task specific maximum utilization clamp.

+ cpu.latency.nice
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ The nice value is in the range [-20, 19].
+
+ This interface file allows reading and setting latency using the
+ same values used by sched_setattr(2). The latency_nice of a group is
+ used to limit the impact of the latency_nice of a task outside the
+ group.


Memory
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 263caac8f76b7..8a541fe2d4626 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11247,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return PRIO_TO_NICE(css_tg(css)->latency_prio);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ int prio;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ prio = NICE_TO_PRIO(nice);
+
+ return sched_group_set_latency(css_tg(css), prio);
+}
#endif

static struct cftype cpu_legacy_files[] = {
@@ -11261,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -11500,6 +11524,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2019e7d46cf5..8a4799c600309 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12545,6 +12545,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;

tg->shares = NICE_0_LOAD;
+ tg->latency_prio = DEFAULT_PRIO;

init_cfs_bandwidth(tg_cfs_bandwidth(tg));

@@ -12643,6 +12644,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}

se->my_q = cfs_rq;
+
+ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO);
+
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -12773,6 +12777,29 @@ next_cpu:
return 0;
}

+int sched_group_set_latency(struct task_group *tg, int prio)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->latency_prio == prio) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->latency_prio = prio;
+
+ for_each_possible_cpu(i)
+ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO);
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */

void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8f8d903a01892..4236c4c893aa7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -372,6 +372,8 @@ struct task_group {

/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
+ /* latency priority of the group. */
+ int latency_prio;

#ifdef CONFIG_SMP
/*
@@ -482,6 +484,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);

extern int sched_group_set_idle(struct task_group *tg, long idle);

+extern int sched_group_set_latency(struct task_group *tg, int prio);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
--
cgit

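For illustration only (not part of the patch above): a minimal user-space sketch of driving the cpu.latency.nice file that the patch adds. It assumes a kernel carrying this series and cgroup v2 mounted at /sys/fs/cgroup; the cgroup path "mygroup" is a made-up placeholder.

#include <stdio.h>

static int set_cgroup_latency_nice(const char *cgroup, int nice)
{
	char path[256];
	FILE *f;

	if (nice < -20 || nice > 19)	/* same range the kernel enforces */
		return -1;

	snprintf(path, sizeof(path), "%s/cpu.latency.nice", cgroup);
	f = fopen(path, "w");
	if (!f)
		return -1;

	fprintf(f, "%d\n", nice);
	return fclose(f);
}

int main(void)
{
	/* "/sys/fs/cgroup/mygroup" is a placeholder; use a real cgroup v2 path. */
	if (set_cgroup_latency_nice("/sys/fs/cgroup/mygroup", -10) != 0)
		perror("cpu.latency.nice");
	return 0;
}
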
From b412068f928064d23f67709f46d36d7659079e54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 May 2023 13:46:30 +0200
Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice

As an alternative to the latency-nice interface; allow applications to
directly set the request/slice using sched_attr::sched_runtime.

The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.

Applications should strive to use their periodic runtime at a high
confidence interval (95%+) as the target slice. Using a smaller slice
will introduce undue preemptions, while using a larger value will
increase latency.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a541fe2d4626..5b71c398f6cf6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7548,10 +7548,18 @@ static void __setscheduler_params(struct task_struct *p,

p->policy = policy;

- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }

/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
@@ -7750,7 +7758,9 @@ recheck:
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
@@ -8079,12 +8089,14 @@ err_size:

static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}

/**
--
cgit

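A user-space sketch of the interface described above: setting a per-task request/slice through sched_attr::sched_runtime while staying in SCHED_OTHER. Older glibc has no sched_setattr() wrapper (very recent versions may provide one), so the raw syscall is used and the structure is declared locally, mirroring the layout documented in sched(7). The 2ms value is only an example inside the documented 0.1ms..100ms clamp; this is not part of the patch itself.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sched.h>		/* SCHED_OTHER */
#include <sys/syscall.h>	/* SYS_sched_setattr */
#include <unistd.h>

/* Mirrors the kernel's struct sched_attr as documented in sched(7). */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;	/* nanoseconds; with this patch, the EEVDF request/slice */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_OTHER;		/* stay in the fair class */
	attr.sched_runtime = 2ULL * 1000 * 1000;	/* 2ms slice, within the clamp */

	if (syscall(SYS_sched_setattr, 0, &attr, 0))	/* pid 0 == calling thread */
		perror("sched_setattr");
	return 0;
}
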
From 2f88c8e802c8b128a155976631f4eb2ce4f3c805 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Date: Thu, 24 Aug 2023 13:33:42 +0530
Subject: sched/eevdf/doc: Modify the documented knob to base_slice_ns as well

After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns'
sysctl to 'base_slice_ns':

e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")

... but we forgot to rename it in the documentation. Do that now.

Fixes: e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice")
Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230824080342.543396-1-sshegde@linux.vnet.ibm.com
---
Documentation/scheduler/sched-design-CFS.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 03db555045151..f68919800f050 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
way the previous scheduler had, and has no heuristics whatsoever. There is
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):

- /sys/kernel/debug/sched/min_granularity_ns
+ /sys/kernel/debug/sched/base_slice_ns

which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
"server" (i.e., good batching) workloads. It defaults to a setting suitable
--
cgit

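Not part of the patch: a small sketch that reads the renamed knob, assuming CONFIG_SCHED_DEBUG is enabled and debugfs is mounted at /sys/kernel/debug (the file is typically readable by root only).

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/kernel/debug/sched/base_slice_ns", "r");

	if (!f) {
		perror("base_slice_ns");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("base slice: %s", buf);	/* value is in nanoseconds */
	fclose(f);
	return 0;
}
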
From 63304558ba5dcaaff9e052ee43cfdcc7f9c29e85 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Aug 2023 15:40:59 +0200
Subject: sched/eevdf: Curb wakeup-preemption

Mike and others noticed that EEVDF does like to over-schedule quite a
bit -- which does hurt performance of a number of benchmarks /
workloads.

In particular, what seems to cause over-scheduling is that when lag is
of the same order (or larger) than the request / slice then placement
will not only cause the task to be placed left of current, but also
with a smaller deadline than current, which causes immediate
preemption.

[ notably, lag bounds are relative to HZ ]

Mike suggested we stick to picking 'current' for as long as it's
eligible to run, giving it uninterrupted runtime until it reaches
parity with the pack.

Augment Mike's suggestion by only allowing it to exhaust its initial
request.

One random data point:

echo NO_RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000

3,723,554 context-switches ( +- 0.56% )
9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% )

echo RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000

2,556,535 context-switches ( +- 0.51% )
9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% )

Suggested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net
---
kernel/sched/fair.c | 12 ++++++++++++
kernel/sched/features.h | 1 +
2 files changed, 13 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f496cef90ce77..0b7445cd5af98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,6 +873,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;

+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
+
while (node) {
struct sched_entity *se = __node_2_se(node);

@@ -5167,6 +5174,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}

update_stats_curr_start(cfs_rq, se);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 61bcbf5e46a45..f770168230ae4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -6,6 +6,7 @@
*/
SCHED_FEAT(PLACE_LAG, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+SCHED_FEAT(RUN_TO_PARITY, true)

/*
* Prefer to schedule the task we woke last (assuming it failed
--
cgit

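To make the vlag/deadline trick above easier to follow, here is a stand-alone toy model (plain user-space C, not kernel code, with invented names and values): set_next_entity() stashes the deadline in the otherwise-idle vlag field at pick time, and the next pick keeps the current task only while that stashed value still equals its deadline, i.e. until the slice is exhausted and a new deadline is assigned.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct entity {
	int64_t  vlag;		/* doubles as "deadline at pick" while running */
	uint64_t deadline;
	bool	 eligible;
};

/* Toy counterpart of the HACK in set_next_entity(): remember the deadline at pick. */
static void set_next_entity(struct entity *se)
{
	se->vlag = (int64_t)se->deadline;
}

/* Toy counterpart of the RUN_TO_PARITY check in pick_eevdf(). */
static bool keep_running(const struct entity *curr)
{
	return curr && curr->eligible && curr->vlag == (int64_t)curr->deadline;
}

int main(void)
{
	struct entity curr = { .deadline = 100, .eligible = true };

	set_next_entity(&curr);
	printf("keep running: %d\n", keep_running(&curr));	/* 1: same slice */

	curr.deadline = 200;	/* slice exhausted -> a new deadline was assigned */
	printf("keep running: %d\n", keep_running(&curr));	/* 0: re-pick normally */
	return 0;
}
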