diff --git a/linux-tkg-patches/6.5/0003-eevdf.patch b/linux-tkg-patches/6.5/0003-eevdf.patch index c73f78f..b52b29c 100644 --- a/linux-tkg-patches/6.5/0003-eevdf.patch +++ b/linux-tkg-patches/6.5/0003-eevdf.patch @@ -1,37 +1,526 @@ -From af4cf40470c22efa3987200fd19478199e08e103 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:40 +0200 -Subject: sched/fair: Add cfs_rq::avg_vruntime +From f40e0ab8e18aa28a5a37e0a7574559f2a914d697 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Fri, 15 Sep 2023 18:05:03 +0200 +Subject: [PATCH] sched-6.5: Introduce EEVDF -In order to move to an eligibility based scheduling policy, we need -to have a better approximation of the ideal scheduler. - -Specifically, for a virtual time weighted fair queueing based -scheduler the ideal scheduler will be the weighted average of the -individual virtual runtimes (math in the comment). - -As such, compute the weighted average to approximate the ideal -scheduler -- note that the approximation is in the individual task -behaviour, which isn't strictly conformant. - -Specifically consider adding a task with a vruntime left of center, in -this case the average will move backwards in time -- something the -ideal scheduler would of course never do. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124603.654144274@infradead.org +Signed-off-by: Piotr Gorski --- - kernel/sched/debug.c | 32 ++++++------ - kernel/sched/fair.c | 137 +++++++++++++++++++++++++++++++++++++++++++++++++-- - kernel/sched/sched.h | 5 ++ - 3 files changed, 154 insertions(+), 20 deletions(-) + Documentation/admin-guide/cgroup-v2.rst | 10 + + Documentation/scheduler/sched-design-CFS.rst | 2 +- + include/linux/rbtree_augmented.h | 26 + + include/linux/sched.h | 9 +- + include/uapi/linux/sched.h | 4 +- + include/uapi/linux/sched/types.h | 19 + + init/init_task.c | 3 +- + kernel/sched/core.c | 130 +- + kernel/sched/debug.c | 49 +- + kernel/sched/fair.c | 1141 +++++++++--------- + kernel/sched/features.h | 25 +- + kernel/sched/sched.h | 24 +- + tools/include/uapi/linux/sched.h | 4 +- + 13 files changed, 783 insertions(+), 663 deletions(-) +diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst +index 4ef890191..3a8d3e1e5 100644 +--- a/Documentation/admin-guide/cgroup-v2.rst ++++ b/Documentation/admin-guide/cgroup-v2.rst +@@ -1121,6 +1121,16 @@ All time durations are in microseconds. + values similar to the sched_setattr(2). This maximum utilization + value is used to clamp the task specific maximum utilization clamp. + ++ cpu.latency.nice ++ A read-write single value file which exists on non-root ++ cgroups. The default is "0". ++ ++ The nice value is in the range [-20, 19]. ++ ++ This interface file allows reading and setting latency using the ++ same values used by sched_setattr(2). The latency_nice of a group is ++ used to limit the impact of the latency_nice of a task outside the ++ group. + + + Memory +diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst +index 03db55504..f68919800 100644 +--- a/Documentation/scheduler/sched-design-CFS.rst ++++ b/Documentation/scheduler/sched-design-CFS.rst +@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the + way the previous scheduler had, and has no heuristics whatsoever. There is + only one central tunable (you have to switch on CONFIG_SCHED_DEBUG): + +- /sys/kernel/debug/sched/min_granularity_ns ++ /sys/kernel/debug/sched/base_slice_ns + + which can be used to tune the scheduler from "desktop" (i.e., low latencies) to + "server" (i.e., good batching) workloads. It defaults to a setting suitable +diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h +index 7ee7ed5de..6dbc5a1bf 100644 +--- a/include/linux/rbtree_augmented.h ++++ b/include/linux/rbtree_augmented.h +@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node, + rb_insert_augmented(node, &root->rb_root, augment); + } + ++static __always_inline struct rb_node * ++rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, ++ bool (*less)(struct rb_node *, const struct rb_node *), ++ const struct rb_augment_callbacks *augment) ++{ ++ struct rb_node **link = &tree->rb_root.rb_node; ++ struct rb_node *parent = NULL; ++ bool leftmost = true; ++ ++ while (*link) { ++ parent = *link; ++ if (less(node, parent)) { ++ link = &parent->rb_left; ++ } else { ++ link = &parent->rb_right; ++ leftmost = false; ++ } ++ } ++ ++ rb_link_node(node, parent, link); ++ augment->propagate(parent, NULL); /* suboptimal */ ++ rb_insert_augmented_cached(node, tree, leftmost, augment); ++ ++ return leftmost ? node : NULL; ++} ++ + /* + * Template for declaring augmented rbtree callbacks (generic case) + * +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 609bde814..e6f3a5e38 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -549,13 +549,18 @@ struct sched_entity { + /* For load-balancing: */ + struct load_weight load; + struct rb_node run_node; ++ u64 deadline; ++ u64 min_deadline; ++ + struct list_head group_node; + unsigned int on_rq; + + u64 exec_start; + u64 sum_exec_runtime; +- u64 vruntime; + u64 prev_sum_exec_runtime; ++ u64 vruntime; ++ s64 vlag; ++ u64 slice; + + u64 nr_migrations; + +@@ -785,6 +790,7 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++ int latency_prio; + + struct sched_entity se; + struct sched_rt_entity rt; +@@ -886,6 +892,7 @@ struct task_struct { + unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; ++ unsigned sched_delayed:1; + + /* Force alignment to the next boundary: */ + unsigned :0; +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ce..b2e932c25 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h +index f2c4589d4..db1e8199e 100644 +--- a/include/uapi/linux/sched/types.h ++++ b/include/uapi/linux/sched/types.h +@@ -10,6 +10,7 @@ struct sched_param { + + #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ ++#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */ + + /* + * Extended scheduling parameters data structure. +@@ -98,6 +99,22 @@ struct sched_param { + * scheduled on a CPU with no more capacity than the specified value. + * + * A task utilization boundary can be reset by setting the attribute to -1. ++ * ++ * Latency Tolerance Attributes ++ * =========================== ++ * ++ * A subset of sched_attr attributes allows to specify the relative latency ++ * requirements of a task with respect to the other tasks running/queued in the ++ * system. ++ * ++ * @ sched_latency_nice task's latency_nice value ++ * ++ * The latency_nice of a task can have any value in a range of ++ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE]. ++ * ++ * A task with latency_nice with the value of LATENCY_NICE_MIN can be ++ * taken for a task requiring a lower latency as opposed to the task with ++ * higher latency_nice. + */ + struct sched_attr { + __u32 size; +@@ -120,6 +137,8 @@ struct sched_attr { + __u32 sched_util_min; + __u32 sched_util_max; + ++ /* latency requirement hints */ ++ __s32 sched_latency_nice; + }; + + #endif /* _UAPI_LINUX_SCHED_TYPES_H */ +diff --git a/init/init_task.c b/init/init_task.c +index ff6c4b9bf..511cbcf35 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -78,6 +78,7 @@ struct task_struct init_task + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++ .latency_prio = DEFAULT_PRIO, + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .user_cpus_ptr = NULL, +@@ -89,7 +90,7 @@ struct task_struct init_task + .fn = do_no_restart_syscall, + }, + .se = { +- .group_node = LIST_HEAD_INIT(init_task.se.group_node), ++ .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, + .rt = { + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index c52c2eba7..095c46027 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) + } + } + ++static inline void set_latency_prio(struct task_struct *p, int prio) ++{ ++ p->latency_prio = prio; ++ set_latency_fair(&p->se, prio - MAX_RT_PRIO); ++} ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * Serializes updates of utilization clamp values +@@ -4501,8 +4507,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++ p->se.vlag = 0; + INIT_LIST_HEAD(&p->se.group_node); + ++ set_latency_prio(p, p->latency_prio); ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; + #endif +@@ -4754,6 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); ++ set_latency_prio(p, NICE_TO_PRIO(0)); + + /* + * We don't need the reset flag anymore after the fork. It has +@@ -6549,6 +6559,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + # define SM_MASK_PREEMPT SM_PREEMPT + #endif + ++static void __deschedule_task(struct rq *rq, struct task_struct *p) ++{ ++ deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); ++ ++ if (p->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++} ++ + /* + * __schedule() is the main scheduler function. + * +@@ -6661,17 +6681,36 @@ static void __sched notrace __schedule(unsigned int sched_mode) + * + * After this, schedule() must not care about p->state any more. + */ +- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); ++ if (!(sched_feat(DELAY_DEQUEUE) && ++ prev->sched_class->eligible_task && ++ !prev->sched_class->eligible_task(rq, prev))) ++ __deschedule_task(rq, prev); ++ else ++ prev->sched_delayed = 1; ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ for (struct task_struct *tmp = prev;;) { + +- if (prev->in_iowait) { +- atomic_inc(&rq->nr_iowait); +- delayacct_blkio_start(); ++ next = pick_next_task(rq, tmp, &rf); ++ if (unlikely(tmp != prev)) ++ finish_task(tmp); ++ ++ if (sched_feat(DELAY_DEQUEUE) && unlikely(next->sched_delayed)) { ++ next->sched_delayed = 0; ++ if (READ_ONCE(next->__state)) { ++ prepare_task(next); ++ smp_wmb(); ++ __deschedule_task(rq, next); ++ tmp = next; ++ continue; + } + } +- switch_count = &prev->nvcsw; ++ ++ break; + } + +- next = pick_next_task(rq, prev, &rf); + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + #ifdef CONFIG_SCHED_DEBUG +@@ -7516,7 +7555,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) + #define SETPARAM_POLICY -1 + + static void __setscheduler_params(struct task_struct *p, +- const struct sched_attr *attr) ++ const struct sched_attr *attr) + { + int policy = attr->sched_policy; + +@@ -7525,10 +7564,18 @@ static void __setscheduler_params(struct task_struct *p, + + p->policy = policy; + +- if (dl_policy(policy)) ++ if (dl_policy(policy)) { + __setparam_dl(p, attr); +- else if (fair_policy(policy)) ++ } else if (fair_policy(policy)) { + p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ if (attr->sched_runtime) { ++ p->se.slice = clamp_t(u64, attr->sched_runtime, ++ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ ++ NSEC_PER_MSEC*100); /* HZ=100 / 10 */ ++ } else { ++ p->se.slice = sysctl_sched_base_slice; ++ } ++ } + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when +@@ -7540,6 +7587,13 @@ static void __setscheduler_params(struct task_struct *p, + set_load_weight(p, true); + } + ++static void __setscheduler_latency(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) ++ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice)); ++} ++ + /* + * Check the target process has a UID that matches the current process's: + */ +@@ -7674,6 +7728,13 @@ static int __sched_setscheduler(struct task_struct *p, + return retval; + } + ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { ++ if (attr->sched_latency_nice > MAX_NICE) ++ return -EINVAL; ++ if (attr->sched_latency_nice < MIN_NICE) ++ return -EINVAL; ++ } ++ + /* Update task specific "requested" clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { + retval = uclamp_validate(p, attr); +@@ -7713,7 +7774,9 @@ static int __sched_setscheduler(struct task_struct *p, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { +- if (fair_policy(policy) && attr->sched_nice != task_nice(p)) ++ if (fair_policy(policy) && ++ (attr->sched_nice != task_nice(p) || ++ (attr->sched_runtime && attr->sched_runtime != p->se.slice))) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; +@@ -7721,6 +7784,9 @@ static int __sched_setscheduler(struct task_struct *p, + goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && ++ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio)) ++ goto change; + + p->sched_reset_on_fork = reset_on_fork; + retval = 0; +@@ -7809,6 +7875,7 @@ static int __sched_setscheduler(struct task_struct *p, + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } ++ __setscheduler_latency(p, attr); + __setscheduler_uclamp(p, attr); + + if (queued) { +@@ -8020,6 +8087,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + ++ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) && ++ size < SCHED_ATTR_SIZE_VER2) ++ return -EINVAL; + /* + * XXX: Do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? +@@ -8035,12 +8105,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + + static void get_params(struct task_struct *p, struct sched_attr *attr) + { +- if (task_has_dl_policy(p)) ++ if (task_has_dl_policy(p)) { + __getparam_dl(p, attr); +- else if (task_has_rt_policy(p)) ++ } else if (task_has_rt_policy(p)) { + attr->sched_priority = p->rt_priority; +- else ++ } else { + attr->sched_nice = task_nice(p); ++ attr->sched_runtime = p->se.slice; ++ } + } + + /** +@@ -8257,6 +8329,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; + ++ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio); ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine +@@ -11180,6 +11254,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + { + return sched_group_set_idle(css_tg(css), idle); + } ++ ++static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ return PRIO_TO_NICE(css_tg(css)->latency_prio); ++} ++ ++static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft, s64 nice) ++{ ++ int prio; ++ ++ if (nice < MIN_NICE || nice > MAX_NICE) ++ return -ERANGE; ++ ++ prio = NICE_TO_PRIO(nice); ++ ++ return sched_group_set_latency(css_tg(css), prio); ++} + #endif + + static struct cftype cpu_legacy_files[] = { +@@ -11194,6 +11287,11 @@ static struct cftype cpu_legacy_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { +@@ -11411,6 +11509,12 @@ static struct cftype cpu_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index aeeba46a096b9..e48d2b2db7bca 100644 +index 066ff1c8a..e7e83181f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -627,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +@@ -347,10 +347,7 @@ static __init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + +- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); +- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); +- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); +- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); ++ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); +@@ -581,9 +578,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", ++ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + p->comm, task_pid_nr(p), + SPLIT_NS(p->se.vruntime), ++ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', ++ SPLIT_NS(p->se.deadline), ++ SPLIT_NS(p->se.slice), ++ SPLIT_NS(p->se.sum_exec_runtime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); + +@@ -626,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { @@ -44,7 +533,7 @@ index aeeba46a096b9..e48d2b2db7bca 100644 unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -644,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +@@ -643,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); @@ -84,11 +573,133 @@ index aeeba46a096b9..e48d2b2db7bca 100644 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); +@@ -863,10 +862,7 @@ static void sched_debug_header(struct seq_file *m) + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) + #define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) +- PN(sysctl_sched_latency); +- PN(sysctl_sched_min_granularity); +- PN(sysctl_sched_idle_min_granularity); +- PN(sysctl_sched_wakeup_granularity); ++ PN(sysctl_sched_base_slice); + P(sysctl_sched_child_runs_first); + P(sysctl_sched_features); + #undef PN +@@ -1089,6 +1085,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + #endif + P(policy); + P(prio); ++ P(latency_prio); + if (task_has_dl_policy(p)) { + P(dl.runtime); + P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index d3df5b1642a6f..bb5460682ae2e 100644 +index 1d9c2482c..4f23f545e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c -@@ -601,9 +601,134 @@ static inline bool entity_before(const struct sched_entity *a, +@@ -47,6 +47,7 @@ + #include + #include + #include ++#include + + #include + +@@ -56,22 +57,6 @@ + #include "stats.h" + #include "autogroup.h" + +-/* +- * Targeted preemption latency for CPU-bound tasks: +- * +- * NOTE: this latency value is not the same as the concept of +- * 'timeslice length' - timeslices in CFS are of variable length +- * and have no persistent notion like in traditional, time-slice +- * based scheduling concepts. +- * +- * (to see the precise effective timeslice length of your workload, +- * run vmstat and monitor the context-switches (cs) field) +- * +- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) +- */ +-unsigned int sysctl_sched_latency = 6000000ULL; +-static unsigned int normalized_sysctl_sched_latency = 6000000ULL; +- + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -90,21 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ +-unsigned int sysctl_sched_min_granularity = 750000ULL; +-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +- +-/* +- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. +- * Applies only when SCHED_IDLE tasks compete with normal tasks. +- * +- * (default: 0.75 msec) +- */ +-unsigned int sysctl_sched_idle_min_granularity = 750000ULL; +- +-/* +- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity +- */ +-static unsigned int sched_nr_latency = 8; ++unsigned int sysctl_sched_base_slice = 750000ULL; ++static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; + + /* + * After fork, child runs first. If set to 0 (default) then +@@ -112,18 +84,6 @@ static unsigned int sched_nr_latency = 8; + */ + unsigned int sysctl_sched_child_runs_first __read_mostly; + +-/* +- * SCHED_OTHER wake-up granularity. +- * +- * This option delays the preemption effects of decoupled workloads +- * and reduces their over-scheduling. Synchronous workloads will still +- * have immediate wakeup/sleep latencies. +- * +- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) +- */ +-unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +- + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + + int sched_thermal_decay_shift; +@@ -277,9 +237,7 @@ static void update_sysctl(void) + + #define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) +- SET_SYSCTL(sched_min_granularity); +- SET_SYSCTL(sched_latency); +- SET_SYSCTL(sched_wakeup_granularity); ++ SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } + +@@ -347,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight + return mul_u64_u32_shr(delta_exec, fact, shift); + } + ++/* ++ * delta /= w ++ */ ++static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) ++{ ++ if (unlikely(se->load.weight != NICE_0_LOAD)) ++ delta = __calc_delta(delta, NICE_0_LOAD, &se->load); ++ ++ return delta; ++} + + const struct sched_class fair_sched_class; + +@@ -601,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } @@ -206,6 +817,66 @@ index d3df5b1642a6f..bb5460682ae2e 100644 + return cfs_rq->min_vruntime + avg; +} + ++/* ++ * lag_i = S - s_i = w_i * (V - v_i) ++ * ++ * However, since V is approximated by the weighted average of all entities it ++ * is possible -- by addition/removal/reweight to the tree -- to move V around ++ * and end up with a larger lag than we started with. ++ * ++ * Limit this to either double the slice length with a minimum of TICK_NSEC ++ * since that is the timing granularity. ++ * ++ * EEVDF gives the following limit for a steady state system: ++ * ++ * -r_max < lag < max(r_max, q) ++ * ++ * XXX could add max_slice to the augmented data to track this. ++ */ ++void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ s64 lag, limit; ++ ++ SCHED_WARN_ON(!se->on_rq); ++ lag = avg_vruntime(cfs_rq) - se->vruntime; ++ ++ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++ se->vlag = clamp(lag, -limit, limit); ++} ++ ++/* ++ * Entity is eligible once it received less service than it ought to have, ++ * eg. lag >= 0. ++ * ++ * lag_i = S - s_i = w_i*(V - v_i) ++ * ++ * lag_i >= 0 -> V >= v_i ++ * ++ * \Sum (v_i - v)*w_i ++ * V = ------------------ + v ++ * \Sum w_i ++ * ++ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) ++ * ++ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due ++ * to the loss in precision caused by the division. ++ */ ++int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ struct sched_entity *curr = cfs_rq->curr; ++ s64 avg = cfs_rq->avg_vruntime; ++ long load = cfs_rq->avg_load; ++ ++ if (curr && curr->on_rq) { ++ unsigned long weight = scale_load_down(curr->load.weight); ++ ++ avg += entity_key(cfs_rq, curr) * weight; ++ load += weight; ++ } ++ ++ return avg >= entity_key(cfs_rq, se) * load; ++} ++ +static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +{ + u64 min_vruntime = cfs_rq->min_vruntime; @@ -222,8 +893,24 @@ index d3df5b1642a6f..bb5460682ae2e 100644 + static void update_min_vruntime(struct cfs_rq *cfs_rq) { ++ struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; -@@ -629,7 +754,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) +- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); + + u64 vruntime = cfs_rq->min_vruntime; + +@@ -618,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + curr = NULL; + } + +- if (leftmost) { /* non-empty tree */ +- struct sched_entity *se = __node_2_se(leftmost); +- ++ if (se) { + if (!curr) + vruntime = se->vruntime; + else +@@ -629,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) /* ensure we never gain time by being placed backwards. */ u64_u32_store(cfs_rq->min_vruntime, @@ -232,22 +919,302 @@ index d3df5b1642a6f..bb5460682ae2e 100644 } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) -@@ -642,12 +767,14 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -637,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) + return entity_before(__node_2_se(a), __node_2_se(b)); + } + ++#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) ++ ++static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) ++{ ++ if (node) { ++ struct sched_entity *rse = __node_2_se(node); ++ if (deadline_gt(min_deadline, se, rse)) ++ se->min_deadline = rse->min_deadline; ++ } ++} ++ ++/* ++ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) ++ */ ++static inline bool min_deadline_update(struct sched_entity *se, bool exit) ++{ ++ u64 old_min_deadline = se->min_deadline; ++ struct rb_node *node = &se->run_node; ++ ++ se->min_deadline = se->deadline; ++ __update_min_deadline(se, node->rb_right); ++ __update_min_deadline(se, node->rb_left); ++ ++ return se->min_deadline == old_min_deadline; ++} ++ ++RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, ++ run_node, min_deadline, min_deadline_update); ++ + /* + * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { +- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); + avg_vruntime_add(cfs_rq, se); - rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); ++ se->min_deadline = se->deadline; ++ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, ++ __entity_less, &min_deadline_cb); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); +- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); ++ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, ++ &min_deadline_cb); + avg_vruntime_sub(cfs_rq, se); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) -@@ -3379,6 +3506,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +@@ -660,14 +845,88 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) + return __node_2_se(left); + } + +-static struct sched_entity *__pick_next_entity(struct sched_entity *se) ++/* ++ * Earliest Eligible Virtual Deadline First ++ * ++ * In order to provide latency guarantees for different request sizes ++ * EEVDF selects the best runnable task from two criteria: ++ * ++ * 1) the task must be eligible (must be owed service) ++ * ++ * 2) from those tasks that meet 1), we select the one ++ * with the earliest virtual deadline. ++ * ++ * We can do this in O(log n) time due to an augmented RB-tree. The ++ * tree keeps the entries sorted on service, but also functions as a ++ * heap based on the deadline by keeping: ++ * ++ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) ++ * ++ * Which allows an EDF like search on (sub)trees. ++ */ ++static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) + { +- struct rb_node *next = rb_next(&se->run_node); ++ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; ++ struct sched_entity *curr = cfs_rq->curr; ++ struct sched_entity *best = NULL; + +- if (!next) +- return NULL; ++ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) ++ curr = NULL; ++ ++ /* ++ * Once selected, run a task until it either becomes non-eligible or ++ * until it gets a new slice. See the HACK in set_next_entity(). ++ */ ++ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) ++ return curr; ++ ++ while (node) { ++ struct sched_entity *se = __node_2_se(node); ++ ++ /* ++ * If this entity is not eligible, try the left subtree. ++ */ ++ if (!entity_eligible(cfs_rq, se)) { ++ node = node->rb_left; ++ continue; ++ } ++ ++ /* ++ * If this entity has an earlier deadline than the previous ++ * best, take this one. If it also has the earliest deadline ++ * of its subtree, we're done. ++ */ ++ if (!best || deadline_gt(deadline, best, se)) { ++ best = se; ++ if (best->deadline == best->min_deadline) ++ break; ++ } ++ ++ /* ++ * If the earlest deadline in this subtree is in the fully ++ * eligible left half of our space, go there. ++ */ ++ if (node->rb_left && ++ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { ++ node = node->rb_left; ++ continue; ++ } ++ ++ node = node->rb_right; ++ } ++ ++ if (!best || (curr && deadline_gt(deadline, best, curr))) ++ best = curr; + +- return __node_2_se(next); ++ if (unlikely(!best)) { ++ struct sched_entity *left = __pick_first_entity(cfs_rq); ++ if (left) { ++ pr_err("EEVDF scheduling fail, picking leftmost\n"); ++ return left; ++ } ++ } ++ ++ return best; + } + + #ifdef CONFIG_SCHED_DEBUG +@@ -689,104 +948,53 @@ int sched_update_scaling(void) + { + unsigned int factor = get_update_sysctl_factor(); + +- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, +- sysctl_sched_min_granularity); +- + #define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) +- WRT_SYSCTL(sched_min_granularity); +- WRT_SYSCTL(sched_latency); +- WRT_SYSCTL(sched_wakeup_granularity); ++ WRT_SYSCTL(sched_base_slice); + #undef WRT_SYSCTL + + return 0; + } + #endif + +-/* +- * delta /= w +- */ +-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) ++void set_latency_fair(struct sched_entity *se, int prio) + { +- if (unlikely(se->load.weight != NICE_0_LOAD)) +- delta = __calc_delta(delta, NICE_0_LOAD, &se->load); ++ u32 weight = sched_prio_to_weight[prio]; ++ u64 base = sysctl_sched_base_slice; + +- return delta; +-} +- +-/* +- * The idea is to set a period in which each task runs once. +- * +- * When there are too many tasks (sched_nr_latency) we have to stretch +- * this period because otherwise the slices get too small. +- * +- * p = (nr <= nl) ? l : l*nr/nl +- */ +-static u64 __sched_period(unsigned long nr_running) +-{ +- if (unlikely(nr_running > sched_nr_latency)) +- return nr_running * sysctl_sched_min_granularity; +- else +- return sysctl_sched_latency; ++ /* ++ * For EEVDF the virtual time slope is determined by w_i (iow. ++ * nice) while the request time r_i is determined by ++ * latency-nice. ++ * ++ * Smaller request gets better latency. ++ */ ++ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); + } + +-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); ++static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); + + /* +- * We calculate the wall-time slice from the period by taking a part +- * proportional to the weight. +- * +- * s = p*P[w/rw] ++ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i ++ * this is probably good enough. + */ +-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) ++static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- unsigned int nr_running = cfs_rq->nr_running; +- struct sched_entity *init_se = se; +- unsigned int min_gran; +- u64 slice; +- +- if (sched_feat(ALT_PERIOD)) +- nr_running = rq_of(cfs_rq)->cfs.h_nr_running; +- +- slice = __sched_period(nr_running + !se->on_rq); +- +- for_each_sched_entity(se) { +- struct load_weight *load; +- struct load_weight lw; +- struct cfs_rq *qcfs_rq; +- +- qcfs_rq = cfs_rq_of(se); +- load = &qcfs_rq->load; +- +- if (unlikely(!se->on_rq)) { +- lw = qcfs_rq->load; +- +- update_load_add(&lw, se->load.weight); +- load = &lw; +- } +- slice = __calc_delta(slice, se->load.weight, load); +- } ++ if ((s64)(se->vruntime - se->deadline) < 0) ++ return; + +- if (sched_feat(BASE_SLICE)) { +- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) +- min_gran = sysctl_sched_idle_min_granularity; +- else +- min_gran = sysctl_sched_min_granularity; ++ /* ++ * EEVDF: vd_i = ve_i + r_i / w_i ++ */ ++ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); + +- slice = max_t(u64, slice, min_gran); ++ /* ++ * The task has consumed its request, reschedule. ++ */ ++ if (cfs_rq->nr_running > 1) { ++ resched_curr(rq_of(cfs_rq)); ++ clear_buddies(cfs_rq, se); + } +- +- return slice; +-} +- +-/* +- * We calculate the vruntime slice of a to-be-inserted task. +- * +- * vs = s/w +- */ +-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) +-{ +- return calc_delta_fair(sched_slice(cfs_rq, se), se); + } + + #include "pelt.h" +@@ -921,6 +1129,7 @@ static void update_curr(struct cfs_rq *cfs_rq) + schedstat_add(cfs_rq->exec_clock, delta_exec); + + curr->vruntime += calc_delta_fair(delta_exec, curr); ++ update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); + + if (entity_is_task(curr)) { +@@ -3375,16 +3584,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) + { ++ unsigned long old_weight = se->load.weight; ++ + if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); @@ -256,7 +1223,29 @@ index d3df5b1642a6f..bb5460682ae2e 100644 update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); -@@ -3394,9 +3523,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + + update_load_set(&se->load, weight); + ++ if (!se->on_rq) { ++ /* ++ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), ++ * we need to scale se->vlag when w_i changes. ++ */ ++ se->vlag = div_s64(se->vlag * old_weight, weight); ++ } else { ++ s64 deadline = se->deadline - se->vruntime; ++ /* ++ * When the weight changes, the virtual time slope changes and ++ * we should adjust the relative virtual deadline accordingly. ++ */ ++ deadline = div_s64(deadline * old_weight, weight); ++ se->deadline = se->vruntime + deadline; ++ } ++ + #ifdef CONFIG_SMP + do { + u32 divider = get_pelt_divider(&se->avg); +@@ -3394,9 +3623,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #endif enqueue_load_avg(cfs_rq, se); @@ -270,208 +1259,38 @@ index d3df5b1642a6f..bb5460682ae2e 100644 } void reweight_task(struct task_struct *p, int prio) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 9baeb1a2dfdd4..52a0a4bde1939 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -548,6 +548,9 @@ struct cfs_rq { - unsigned int idle_nr_running; /* SCHED_IDLE */ - unsigned int idle_h_nr_running; /* SCHED_IDLE */ +@@ -4692,158 +4923,125 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} -+ s64 avg_vruntime; -+ u64 avg_load; -+ - u64 exec_clock; - u64 min_vruntime; - #ifdef CONFIG_SCHED_CORE -@@ -3483,4 +3486,6 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } - static inline void init_sched_mm_cid(struct task_struct *t) { } - #endif + #endif /* CONFIG_SMP */ -+extern u64 avg_vruntime(struct cfs_rq *cfs_rq); -+ - #endif /* _KERNEL_SCHED_SCHED_H */ --- -cgit - -From e0c2ff903c320d3fd3c2c604dc401b3b7c0a1d13 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:41 +0200 -Subject: sched/fair: Remove sched_feat(START_DEBIT) - -With the introduction of avg_vruntime() there is no need to use worse -approximations. Take the 0-lag point as starting point for inserting -new tasks. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124603.722361178@infradead.org ---- - kernel/sched/fair.c | 21 +-------------------- - kernel/sched/features.h | 6 ------ - 2 files changed, 1 insertion(+), 26 deletions(-) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index bb5460682ae2e..fc43482c13e99 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -906,16 +906,6 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) - return slice; - } - --/* -- * We calculate the vruntime slice of a to-be-inserted task. -- * -- * vs = s/w -- */ --static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) +-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -- return calc_delta_fair(sched_slice(cfs_rq, se), se); +-#ifdef CONFIG_SCHED_DEBUG +- s64 d = se->vruntime - cfs_rq->min_vruntime; +- +- if (d < 0) +- d = -d; +- +- if (d > 3*sysctl_sched_latency) +- schedstat_inc(cfs_rq->nr_spread_over); +-#endif -} - - #include "pelt.h" - #ifdef CONFIG_SMP - -@@ -4862,16 +4852,7 @@ static inline bool entity_is_long_sleeper(struct sched_entity *se) - static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +-static inline bool entity_is_long_sleeper(struct sched_entity *se) ++static void ++place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { -- u64 vruntime = cfs_rq->min_vruntime; -- -- /* -- * The 'current' period is already promised to the current tasks, -- * however the extra weight of the new task will slow them down a -- * little, place the new task so that it fits in the slot that -- * stays open at the end. -- */ -- if (initial && sched_feat(START_DEBIT)) -- vruntime += sched_vslice(cfs_rq, se); -+ u64 vruntime = avg_vruntime(cfs_rq); - - /* sleeps up to a single latency don't count. */ - if (!initial) { -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd33..fa828b36533df 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -6,12 +6,6 @@ - */ - SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) - --/* -- * Place new tasks ahead so that they do not starve already running -- * tasks -- */ --SCHED_FEAT(START_DEBIT, true) -- - /* - * Prefer to schedule the task we woke last (assuming it failed - * wakeup-preemption), since its likely going to consume data we --- -cgit - -From 86bfbb7ce4f67a88df2639198169b685668e7349 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:42 +0200 -Subject: sched/fair: Add lag based placement - -With the introduction of avg_vruntime, it is possible to approximate -lag (the entire purpose of introducing it in fact). Use this to do lag -based placement over sleep+wake. - -Specifically, the FAIR_SLEEPERS thing places things too far to the -left and messes up the deadline aspect of EEVDF. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124603.794929315@infradead.org ---- - include/linux/sched.h | 3 +- - kernel/sched/core.c | 1 + - kernel/sched/fair.c | 168 +++++++++++++++++++++++++++++++++++++----------- - kernel/sched/features.h | 8 +++ - 4 files changed, 141 insertions(+), 39 deletions(-) - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 2aab7be46f7e8..ba1828b2a6a50 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -554,8 +554,9 @@ struct sched_entity { - - u64 exec_start; - u64 sum_exec_runtime; -- u64 vruntime; - u64 prev_sum_exec_runtime; -+ u64 vruntime; -+ s64 vlag; - - u64 nr_migrations; - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 83e36547af176..84b0d47ed9b85 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4501,6 +4501,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->se.prev_sum_exec_runtime = 0; - p->se.nr_migrations = 0; - p->se.vruntime = 0; -+ p->se.vlag = 0; - INIT_LIST_HEAD(&p->se.group_node); - - #ifdef CONFIG_FAIR_GROUP_SCHED -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index fc43482c13e99..dd12ada69b121 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -715,6 +715,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) - return cfs_rq->min_vruntime + avg; - } - -+/* -+ * lag_i = S - s_i = w_i * (V - v_i) -+ */ -+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) -+{ -+ SCHED_WARN_ON(!se->on_rq); -+ se->vlag = avg_vruntime(cfs_rq) - se->vruntime; -+} -+ - static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) - { - u64 min_vruntime = cfs_rq->min_vruntime; -@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } - static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) - { -+ unsigned long old_weight = se->load.weight; -+ - if (se->on_rq) { - /* commit outstanding execution time */ - if (cfs_rq->curr == se) -@@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - - update_load_set(&se->load, weight); - -+ if (!se->on_rq) { -+ /* -+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), -+ * we need to scale se->vlag when w_i changes. -+ */ -+ se->vlag = div_s64(se->vlag * old_weight, weight); -+ } -+ - #ifdef CONFIG_SMP - do { - u32 divider = get_pelt_divider(&se->avg); -@@ -4853,49 +4872,119 @@ static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - { - u64 vruntime = avg_vruntime(cfs_rq); +- struct cfs_rq *cfs_rq; +- u64 sleep_time; ++ u64 vslice, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; -- /* sleeps up to a single latency don't count. */ -- if (!initial) { -- unsigned long thresh; +- if (se->exec_start == 0) +- return false; ++ se->slice = sysctl_sched_base_slice; ++ vslice = calc_delta_fair(se->slice, se); + +- cfs_rq = cfs_rq_of(se); + /* + * Due to how V is constructed as the weighted average of entities, + * adding tasks with positive lag, or removing tasks with negative lag @@ -480,19 +1299,21 @@ index fc43482c13e99..dd12ada69b121 100644 + * + * EEVDF: placement strategy #1 / #2 + */ -+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) { ++ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; -- if (se_is_idle(se)) -- thresh = sysctl_sched_min_granularity; -- else -- thresh = sysctl_sched_latency; +- sleep_time = rq_clock_task(rq_of(cfs_rq)); + lag = se->vlag; - /* -- * Halve their sleep time's effect, to allow -- * for a gentler effect of sleepers: +- /* Happen while migrating because of clock task divergence */ +- if (sleep_time <= se->exec_start) +- return false; +- +- sleep_time -= se->exec_start; +- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) +- return true; ++ /* + * If we want to place a task and preserve lag, we have to + * consider the effect of the new entity on the weighted + * average and compensate for this, otherwise lag can quickly @@ -543,7 +1364,52 @@ index fc43482c13e99..dd12ada69b121 100644 + * = W*vl_i + * + * vl_i = (W + w_i)*vl'_i / W - */ ++ */ ++ load = cfs_rq->avg_load; ++ if (curr && curr->on_rq) ++ load += scale_load_down(curr->load.weight); + +- return false; +-} ++ lag *= load + scale_load_down(se->load.weight); ++ if (WARN_ON_ONCE(!load)) ++ load = 1; ++ lag = div_s64(lag, load); ++ } + +-static void +-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +-{ +- u64 vruntime = cfs_rq->min_vruntime; ++ se->vruntime = vruntime - lag; + + /* +- * The 'current' period is already promised to the current tasks, +- * however the extra weight of the new task will slow them down a +- * little, place the new task so that it fits in the slot that +- * stays open at the end. ++ * When joining the competition; the exisiting tasks will be, ++ * on average, halfway through their slice, as such start tasks ++ * off with half a slice to ease into the competition. + */ +- if (initial && sched_feat(START_DEBIT)) +- vruntime += sched_vslice(cfs_rq, se); +- +- /* sleeps up to a single latency don't count. */ +- if (!initial) { +- unsigned long thresh; ++ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) ++ vslice /= 2; + +- if (se_is_idle(se)) +- thresh = sysctl_sched_min_granularity; +- else +- thresh = sysctl_sched_latency; +- +- /* +- * Halve their sleep time's effect, to allow +- * for a gentler effect of sleepers: +- */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - @@ -573,632 +1439,6 @@ index fc43482c13e99..dd12ada69b121 100644 - se->vruntime = vruntime; - else - se->vruntime = max_vruntime(se->vruntime, vruntime); -+ load = cfs_rq->avg_load; -+ if (curr && curr->on_rq) -+ load += curr->load.weight; -+ -+ lag *= load + se->load.weight; -+ if (WARN_ON_ONCE(!load)) -+ load = 1; -+ lag = div_s64(lag, load); -+ -+ vruntime -= lag; -+ } -+ -+ if (sched_feat(FAIR_SLEEPERS)) { -+ -+ /* sleeps up to a single latency don't count. */ -+ if (!initial) { -+ unsigned long thresh; -+ -+ if (se_is_idle(se)) -+ thresh = sysctl_sched_min_granularity; -+ else -+ thresh = sysctl_sched_latency; -+ -+ /* -+ * Halve their sleep time's effect, to allow -+ * for a gentler effect of sleepers: -+ */ -+ if (sched_feat(GENTLE_FAIR_SLEEPERS)) -+ thresh >>= 1; -+ -+ vruntime -= thresh; -+ } -+ -+ /* -+ * Pull vruntime of the entity being placed to the base level of -+ * cfs_rq, to prevent boosting it if placed backwards. If the entity -+ * slept for a long time, don't even try to compare its vruntime with -+ * the base as it may be too far off and the comparison may get -+ * inversed due to s64 overflow. -+ */ -+ if (!entity_is_long_sleeper(se)) -+ vruntime = max_vruntime(se->vruntime, vruntime); -+ } -+ -+ se->vruntime = vruntime; - } - - static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -@@ -5077,6 +5166,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - - clear_buddies(cfs_rq, se); - -+ if (flags & DEQUEUE_SLEEP) -+ update_entity_lag(cfs_rq, se); -+ - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); - se->on_rq = 0; -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index fa828b36533df..7958a10fe23bb 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -1,11 +1,19 @@ - /* SPDX-License-Identifier: GPL-2.0 */ -+ - /* - * Only give sleepers 50% of their service deficit. This allows - * them to run sooner, but does not allow tons of sleepers to - * rip the spread apart. - */ -+SCHED_FEAT(FAIR_SLEEPERS, false) - SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) - -+/* -+ * Using the avg_vruntime, do the right thing and preserve lag across -+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. -+ */ -+SCHED_FEAT(PLACE_LAG, true) -+ - /* - * Prefer to schedule the task we woke last (assuming it failed - * wakeup-preemption), since its likely going to consume data we --- -cgit - -From 99d4d26551b56f4e523dd04e4970b94aa796a64e Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:43 +0200 -Subject: rbtree: Add rb_add_augmented_cached() helper - -While slightly sub-optimal, updating the augmented data while going -down the tree during lookup would be faster -- alas the augment -interface does not currently allow for that, provide a generic helper -to add a node to an augmented cached tree. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124603.862983648@infradead.org ---- - include/linux/rbtree_augmented.h | 26 ++++++++++++++++++++++++++ - 1 file changed, 26 insertions(+) - -diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h -index 7ee7ed5de7227..6dbc5a1bf6a8c 100644 ---- a/include/linux/rbtree_augmented.h -+++ b/include/linux/rbtree_augmented.h -@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node, - rb_insert_augmented(node, &root->rb_root, augment); - } - -+static __always_inline struct rb_node * -+rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, -+ bool (*less)(struct rb_node *, const struct rb_node *), -+ const struct rb_augment_callbacks *augment) -+{ -+ struct rb_node **link = &tree->rb_root.rb_node; -+ struct rb_node *parent = NULL; -+ bool leftmost = true; -+ -+ while (*link) { -+ parent = *link; -+ if (less(node, parent)) { -+ link = &parent->rb_left; -+ } else { -+ link = &parent->rb_right; -+ leftmost = false; -+ } -+ } -+ -+ rb_link_node(node, parent, link); -+ augment->propagate(parent, NULL); /* suboptimal */ -+ rb_insert_augmented_cached(node, tree, leftmost, augment); -+ -+ return leftmost ? node : NULL; -+} -+ - /* - * Template for declaring augmented rbtree callbacks (generic case) - * --- -cgit - -From 147f3efaa24182a21706bca15eab2f3f4630b5fe Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:44 +0200 -Subject: sched/fair: Implement an EEVDF-like scheduling policy - -Where CFS is currently a WFQ based scheduler with only a single knob, -the weight. The addition of a second, latency oriented parameter, -makes something like WF2Q or EEVDF based a much better fit. - -Specifically, EEVDF does EDF like scheduling in the left half of the -tree -- those entities that are owed service. Except because this is a -virtual time scheduler, the deadlines are in virtual time as well, -which is what allows over-subscription. - -EEVDF has two parameters: - - - weight, or time-slope: which is mapped to nice just as before - - - request size, or slice length: which is used to compute - the virtual deadline as: vd_i = ve_i + r_i/w_i - -Basically, by setting a smaller slice, the deadline will be earlier -and the task will be more eligible and ran earlier. - -Tick driven preemption is driven by request/slice completion; while -wakeup preemption is driven by the deadline. - -Because the tree is now effectively an interval tree, and the -selection is no longer 'leftmost', over-scheduling is less of a -problem. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124603.931005524@infradead.org ---- - include/linux/sched.h | 4 + - kernel/sched/core.c | 1 + - kernel/sched/debug.c | 6 +- - kernel/sched/fair.c | 338 +++++++++++++++++++++++++++++++++++++++++------- - kernel/sched/features.h | 3 + - kernel/sched/sched.h | 4 +- - 6 files changed, 308 insertions(+), 48 deletions(-) - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index ba1828b2a6a50..177b3f3676ef8 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -549,6 +549,9 @@ struct sched_entity { - /* For load-balancing: */ - struct load_weight load; - struct rb_node run_node; -+ u64 deadline; -+ u64 min_deadline; -+ - struct list_head group_node; - unsigned int on_rq; - -@@ -557,6 +560,7 @@ struct sched_entity { - u64 prev_sum_exec_runtime; - u64 vruntime; - s64 vlag; -+ u64 slice; - - u64 nr_migrations; - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 84b0d47ed9b85..e85a2fd258e2b 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4502,6 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->se.nr_migrations = 0; - p->se.vruntime = 0; - p->se.vlag = 0; -+ p->se.slice = sysctl_sched_min_granularity; - INIT_LIST_HEAD(&p->se.group_node); - - #ifdef CONFIG_FAIR_GROUP_SCHED -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index e48d2b2db7bca..18efc6d0cc5ab 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -582,9 +582,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) - else - SEQ_printf(m, " %c", task_state_to_char(p)); - -- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", -+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", - p->comm, task_pid_nr(p), - SPLIT_NS(p->se.vruntime), -+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', -+ SPLIT_NS(p->se.deadline), -+ SPLIT_NS(p->se.slice), -+ SPLIT_NS(p->se.sum_exec_runtime), - (long long)(p->nvcsw + p->nivcsw), - p->prio); - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index dd12ada69b121..4d3505dba476e 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -47,6 +47,7 @@ - #include - #include - #include -+#include - - #include - -@@ -347,6 +348,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight - return mul_u64_u32_shr(delta_exec, fact, shift); - } - -+/* -+ * delta /= w -+ */ -+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) -+{ -+ if (unlikely(se->load.weight != NICE_0_LOAD)) -+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load); -+ -+ return delta; -+} - - const struct sched_class fair_sched_class; - -@@ -717,11 +728,62 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) - - /* - * lag_i = S - s_i = w_i * (V - v_i) -+ * -+ * However, since V is approximated by the weighted average of all entities it -+ * is possible -- by addition/removal/reweight to the tree -- to move V around -+ * and end up with a larger lag than we started with. -+ * -+ * Limit this to either double the slice length with a minimum of TICK_NSEC -+ * since that is the timing granularity. -+ * -+ * EEVDF gives the following limit for a steady state system: -+ * -+ * -r_max < lag < max(r_max, q) -+ * -+ * XXX could add max_slice to the augmented data to track this. - */ - void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -+ s64 lag, limit; -+ - SCHED_WARN_ON(!se->on_rq); -- se->vlag = avg_vruntime(cfs_rq) - se->vruntime; -+ lag = avg_vruntime(cfs_rq) - se->vruntime; -+ -+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); -+ se->vlag = clamp(lag, -limit, limit); -+} -+ -+/* -+ * Entity is eligible once it received less service than it ought to have, -+ * eg. lag >= 0. -+ * -+ * lag_i = S - s_i = w_i*(V - v_i) -+ * -+ * lag_i >= 0 -> V >= v_i -+ * -+ * \Sum (v_i - v)*w_i -+ * V = ------------------ + v -+ * \Sum w_i -+ * -+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) -+ * -+ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due -+ * to the loss in precision caused by the division. -+ */ -+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) -+{ -+ struct sched_entity *curr = cfs_rq->curr; -+ s64 avg = cfs_rq->avg_vruntime; -+ long load = cfs_rq->avg_load; -+ -+ if (curr && curr->on_rq) { -+ unsigned long weight = scale_load_down(curr->load.weight); -+ -+ avg += entity_key(cfs_rq, curr) * weight; -+ load += weight; -+ } -+ -+ return avg >= entity_key(cfs_rq, se) * load; - } - - static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) -@@ -740,8 +802,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) - - static void update_min_vruntime(struct cfs_rq *cfs_rq) - { -+ struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *curr = cfs_rq->curr; -- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); - - u64 vruntime = cfs_rq->min_vruntime; - -@@ -752,9 +814,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) - curr = NULL; - } - -- if (leftmost) { /* non-empty tree */ -- struct sched_entity *se = __node_2_se(leftmost); -- -+ if (se) { - if (!curr) - vruntime = se->vruntime; - else -@@ -771,18 +831,50 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) - return entity_before(__node_2_se(a), __node_2_se(b)); - } - -+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) -+ -+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) -+{ -+ if (node) { -+ struct sched_entity *rse = __node_2_se(node); -+ if (deadline_gt(min_deadline, se, rse)) -+ se->min_deadline = rse->min_deadline; -+ } -+} -+ -+/* -+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) -+ */ -+static inline bool min_deadline_update(struct sched_entity *se, bool exit) -+{ -+ u64 old_min_deadline = se->min_deadline; -+ struct rb_node *node = &se->run_node; -+ -+ se->min_deadline = se->deadline; -+ __update_min_deadline(se, node->rb_right); -+ __update_min_deadline(se, node->rb_left); -+ -+ return se->min_deadline == old_min_deadline; -+} -+ -+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, -+ run_node, min_deadline, min_deadline_update); -+ - /* - * Enqueue an entity into the rb-tree: - */ - static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - { - avg_vruntime_add(cfs_rq, se); -- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); -+ se->min_deadline = se->deadline; -+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, -+ __entity_less, &min_deadline_cb); - } - - static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - { -- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); -+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, -+ &min_deadline_cb); - avg_vruntime_sub(cfs_rq, se); - } - -@@ -806,6 +898,97 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) - return __node_2_se(next); - } - -+static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr) -+{ -+ struct sched_entity *left = __pick_first_entity(cfs_rq); -+ -+ /* -+ * If curr is set we have to see if its left of the leftmost entity -+ * still in the tree, provided there was anything in the tree at all. -+ */ -+ if (!left || (curr && entity_before(curr, left))) -+ left = curr; -+ -+ return left; -+} -+ -+/* -+ * Earliest Eligible Virtual Deadline First -+ * -+ * In order to provide latency guarantees for different request sizes -+ * EEVDF selects the best runnable task from two criteria: -+ * -+ * 1) the task must be eligible (must be owed service) -+ * -+ * 2) from those tasks that meet 1), we select the one -+ * with the earliest virtual deadline. -+ * -+ * We can do this in O(log n) time due to an augmented RB-tree. The -+ * tree keeps the entries sorted on service, but also functions as a -+ * heap based on the deadline by keeping: -+ * -+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) -+ * -+ * Which allows an EDF like search on (sub)trees. -+ */ -+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -+{ -+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; -+ struct sched_entity *curr = cfs_rq->curr; -+ struct sched_entity *best = NULL; -+ -+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) -+ curr = NULL; -+ -+ while (node) { -+ struct sched_entity *se = __node_2_se(node); -+ -+ /* -+ * If this entity is not eligible, try the left subtree. -+ */ -+ if (!entity_eligible(cfs_rq, se)) { -+ node = node->rb_left; -+ continue; -+ } -+ -+ /* -+ * If this entity has an earlier deadline than the previous -+ * best, take this one. If it also has the earliest deadline -+ * of its subtree, we're done. -+ */ -+ if (!best || deadline_gt(deadline, best, se)) { -+ best = se; -+ if (best->deadline == best->min_deadline) -+ break; -+ } -+ -+ /* -+ * If the earlest deadline in this subtree is in the fully -+ * eligible left half of our space, go there. -+ */ -+ if (node->rb_left && -+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { -+ node = node->rb_left; -+ continue; -+ } -+ -+ node = node->rb_right; -+ } -+ -+ if (!best || (curr && deadline_gt(deadline, best, curr))) -+ best = curr; -+ -+ if (unlikely(!best)) { -+ struct sched_entity *left = __pick_first_entity(cfs_rq); -+ if (left) { -+ pr_err("EEVDF scheduling fail, picking leftmost\n"); -+ return left; -+ } -+ } -+ -+ return best; -+} -+ - #ifdef CONFIG_SCHED_DEBUG - struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) - { -@@ -839,17 +1022,6 @@ int sched_update_scaling(void) - } - #endif - --/* -- * delta /= w -- */ --static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) --{ -- if (unlikely(se->load.weight != NICE_0_LOAD)) -- delta = __calc_delta(delta, NICE_0_LOAD, &se->load); -- -- return delta; --} -- - /* - * The idea is to set a period in which each task runs once. - * -@@ -915,6 +1087,48 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) - return slice; - } - -+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); -+ -+/* -+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i -+ * this is probably good enough. -+ */ -+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) -+{ -+ if ((s64)(se->vruntime - se->deadline) < 0) -+ return; -+ -+ if (sched_feat(EEVDF)) { -+ /* -+ * For EEVDF the virtual time slope is determined by w_i (iow. -+ * nice) while the request time r_i is determined by -+ * sysctl_sched_min_granularity. -+ */ -+ se->slice = sysctl_sched_min_granularity; -+ -+ /* -+ * The task has consumed its request, reschedule. -+ */ -+ if (cfs_rq->nr_running > 1) { -+ resched_curr(rq_of(cfs_rq)); -+ clear_buddies(cfs_rq, se); -+ } -+ } else { -+ /* -+ * When many tasks blow up the sched_period; it is possible -+ * that sched_slice() reports unusually large results (when -+ * many tasks are very light for example). Therefore impose a -+ * maximum. -+ */ -+ se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency); -+ } -+ -+ /* -+ * EEVDF: vd_i = ve_i + r_i / w_i -+ */ -+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); -+} -+ - #include "pelt.h" - #ifdef CONFIG_SMP - -@@ -1047,6 +1261,7 @@ static void update_curr(struct cfs_rq *cfs_rq) - schedstat_add(cfs_rq->exec_clock, delta_exec); - - curr->vruntime += calc_delta_fair(delta_exec, curr); -+ update_deadline(cfs_rq, curr); - update_min_vruntime(cfs_rq); - - if (entity_is_task(curr)) { -@@ -3521,6 +3736,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - * we need to scale se->vlag when w_i changes. - */ - se->vlag = div_s64(se->vlag * old_weight, weight); -+ } else { -+ s64 deadline = se->deadline - se->vruntime; -+ /* -+ * When the weight changes, the virtual time slope changes and -+ * we should adjust the relative virtual deadline accordingly. -+ */ -+ deadline = div_s64(deadline * old_weight, weight); -+ se->deadline = se->vruntime + deadline; - } - - #ifdef CONFIG_SMP -@@ -4871,6 +5094,7 @@ static inline bool entity_is_long_sleeper(struct sched_entity *se) - static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - { -+ u64 vslice = calc_delta_fair(se->slice, se); - u64 vruntime = avg_vruntime(cfs_rq); - s64 lag = 0; - -@@ -4942,9 +5166,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - */ - load = cfs_rq->avg_load; - if (curr && curr->on_rq) -- load += curr->load.weight; -+ load += scale_load_down(curr->load.weight); - -- lag *= load + se->load.weight; -+ lag *= load + scale_load_down(se->load.weight); - if (WARN_ON_ONCE(!load)) - load = 1; - lag = div_s64(lag, load); -@@ -4985,6 +5209,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - } - - se->vruntime = vruntime; -+ -+ /* -+ * When joining the competition; the exisiting tasks will be, -+ * on average, halfway through their slice, as such start tasks -+ * off with half a slice to ease into the competition. -+ */ -+ if (sched_feat(PLACE_DEADLINE_INITIAL) && initial) -+ vslice /= 2; -+ + /* + * EEVDF: vd_i = ve_i + r_i/w_i + */ @@ -1206,353 +1446,6 @@ index dd12ada69b121..4d3505dba476e 100644 } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -@@ -5207,19 +5444,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - static void - check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) - { -- unsigned long ideal_runtime, delta_exec; -+ unsigned long delta_exec; - struct sched_entity *se; - s64 delta; - -- /* -- * When many tasks blow up the sched_period; it is possible that -- * sched_slice() reports unusually large results (when many tasks are -- * very light for example). Therefore impose a maximum. -- */ -- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); -- - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; -- if (delta_exec > ideal_runtime) { -+ if (delta_exec > curr->slice) { - resched_curr(rq_of(cfs_rq)); - /* - * The current task ran long enough, ensure it doesn't get -@@ -5243,7 +5473,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) - if (delta < 0) - return; - -- if (delta > ideal_runtime) -+ if (delta > curr->slice) - resched_curr(rq_of(cfs_rq)); - } - -@@ -5298,17 +5528,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - static struct sched_entity * - pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) - { -- struct sched_entity *left = __pick_first_entity(cfs_rq); -- struct sched_entity *se; -+ struct sched_entity *left, *se; - -- /* -- * If curr is set we have to see if its left of the leftmost entity -- * still in the tree, provided there was anything in the tree at all. -- */ -- if (!left || (curr && entity_before(curr, left))) -- left = curr; -+ if (sched_feat(EEVDF)) { -+ /* -+ * Enabling NEXT_BUDDY will affect latency but not fairness. -+ */ -+ if (sched_feat(NEXT_BUDDY) && -+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -+ return cfs_rq->next; -+ -+ return pick_eevdf(cfs_rq); -+ } - -- se = left; /* ideally we run the leftmost entity */ -+ se = left = pick_cfs(cfs_rq, curr); - - /* - * Avoid running the skip buddy, if running something else can -@@ -5401,7 +5634,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) - return; - #endif - -- if (cfs_rq->nr_running > 1) -+ if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1) - check_preempt_tick(cfs_rq, curr); - } - -@@ -6445,13 +6678,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} - static void hrtick_start_fair(struct rq *rq, struct task_struct *p) - { - struct sched_entity *se = &p->se; -- struct cfs_rq *cfs_rq = cfs_rq_of(se); - - SCHED_WARN_ON(task_rq(p) != rq); - - if (rq->cfs.h_nr_running > 1) { -- u64 slice = sched_slice(cfs_rq, se); - u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; -+ u64 slice = se->slice; - s64 delta = slice - ran; - - if (delta < 0) { -@@ -8228,7 +8460,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - if (cse_is_idle != pse_is_idle) - return; - -- update_curr(cfs_rq_of(se)); -+ cfs_rq = cfs_rq_of(se); -+ update_curr(cfs_rq); -+ -+ if (sched_feat(EEVDF)) { -+ /* -+ * XXX pick_eevdf(cfs_rq) != se ? -+ */ -+ if (pick_eevdf(cfs_rq) == pse) -+ goto preempt; -+ -+ return; -+ } -+ - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is -@@ -8474,7 +8718,7 @@ static void yield_task_fair(struct rq *rq) - - clear_buddies(cfs_rq, se); - -- if (curr->policy != SCHED_BATCH) { -+ if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. -@@ -8487,6 +8731,8 @@ static void yield_task_fair(struct rq *rq) - */ - rq_clock_skip_update(rq); - } -+ if (sched_feat(EEVDF)) -+ se->deadline += calc_delta_fair(se->slice, se); - - set_skip_buddy(se); - } -@@ -12363,8 +12609,8 @@ static void rq_offline_fair(struct rq *rq) - static inline bool - __entity_slice_used(struct sched_entity *se, int min_nr_tasks) - { -- u64 slice = sched_slice(cfs_rq_of(se), se); - u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; -+ u64 slice = se->slice; - - return (rtime * min_nr_tasks > slice); - } -@@ -13059,7 +13305,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task - * idle runqueue: - */ - if (rq->cfs.load.weight) -- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); -+ rr_interval = NS_TO_JIFFIES(se->slice); - - return rr_interval; - } -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 7958a10fe23bb..60cce1e6f37b6 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -13,6 +13,7 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) - * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. - */ - SCHED_FEAT(PLACE_LAG, true) -+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) - - /* - * Prefer to schedule the task we woke last (assuming it failed -@@ -103,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false) - - SCHED_FEAT(ALT_PERIOD, true) - SCHED_FEAT(BASE_SLICE, true) -+ -+SCHED_FEAT(EEVDF, true) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 52a0a4bde1939..aa5b293ca4ed3 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2505,9 +2505,10 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); - extern const_debug unsigned int sysctl_sched_nr_migrate; - extern const_debug unsigned int sysctl_sched_migration_cost; - -+extern unsigned int sysctl_sched_min_granularity; -+ - #ifdef CONFIG_SCHED_DEBUG - extern unsigned int sysctl_sched_latency; --extern unsigned int sysctl_sched_min_granularity; - extern unsigned int sysctl_sched_idle_min_granularity; - extern unsigned int sysctl_sched_wakeup_granularity; - extern int sysctl_resched_latency_warn_ms; -@@ -3487,5 +3488,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } - #endif - - extern u64 avg_vruntime(struct cfs_rq *cfs_rq); -+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); - - #endif /* _KERNEL_SCHED_SCHED_H */ --- -cgit - -From 76cae9dbe185b82aeb0640aa2b73da4a8e0088ce Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:45 +0200 -Subject: sched/fair: Commit to lag based placement - -Removes the FAIR_SLEEPERS code in favour of the new LAG based -placement. - -Specifically, the whole FAIR_SLEEPER thing was a very crude -approximation to make up for the lack of lag based placement, -specifically the 'service owed' part. This is important for things -like 'starve' and 'hackbench'. - -One side effect of FAIR_SLEEPER is that it caused 'small' unfairness, -specifically, by always ignoring up-to 'thresh' sleeptime it would -have a 50%/50% time distribution for a 50% sleeper vs a 100% runner, -while strictly speaking this should (of course) result in a 33%/67% -split (as CFS will also do if the sleep period exceeds 'thresh'). - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124604.000198861@infradead.org ---- - kernel/sched/fair.c | 59 +------------------------------------------------ - kernel/sched/features.h | 8 ------- - 2 files changed, 1 insertion(+), 66 deletions(-) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 4d3505dba476e..58798dae11b60 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -5068,29 +5068,6 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) - #endif - } - --static inline bool entity_is_long_sleeper(struct sched_entity *se) --{ -- struct cfs_rq *cfs_rq; -- u64 sleep_time; -- -- if (se->exec_start == 0) -- return false; -- -- cfs_rq = cfs_rq_of(se); -- -- sleep_time = rq_clock_task(rq_of(cfs_rq)); -- -- /* Happen while migrating because of clock task divergence */ -- if (sleep_time <= se->exec_start) -- return false; -- -- sleep_time -= se->exec_start; -- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) -- return true; -- -- return false; --} -- - static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - { -@@ -5172,43 +5149,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - if (WARN_ON_ONCE(!load)) - load = 1; - lag = div_s64(lag, load); -- -- vruntime -= lag; -- } -- -- if (sched_feat(FAIR_SLEEPERS)) { -- -- /* sleeps up to a single latency don't count. */ -- if (!initial) { -- unsigned long thresh; -- -- if (se_is_idle(se)) -- thresh = sysctl_sched_min_granularity; -- else -- thresh = sysctl_sched_latency; -- -- /* -- * Halve their sleep time's effect, to allow -- * for a gentler effect of sleepers: -- */ -- if (sched_feat(GENTLE_FAIR_SLEEPERS)) -- thresh >>= 1; -- -- vruntime -= thresh; -- } -- -- /* -- * Pull vruntime of the entity being placed to the base level of -- * cfs_rq, to prevent boosting it if placed backwards. If the entity -- * slept for a long time, don't even try to compare its vruntime with -- * the base as it may be too far off and the comparison may get -- * inversed due to s64 overflow. -- */ -- if (!entity_is_long_sleeper(se)) -- vruntime = max_vruntime(se->vruntime, vruntime); - } - -- se->vruntime = vruntime; -+ se->vruntime = vruntime - lag; - - /* - * When joining the competition; the exisiting tasks will be, -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 60cce1e6f37b6..2a830eccda3e9 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -1,13 +1,5 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - --/* -- * Only give sleepers 50% of their service deficit. This allows -- * them to run sooner, but does not allow tons of sleepers to -- * rip the spread apart. -- */ --SCHED_FEAT(FAIR_SLEEPERS, false) --SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) -- - /* - * Using the avg_vruntime, do the right thing and preserve lag across - * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. --- -cgit - -From e8f331bcc270354a803c2127c486190d33eac441 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:46 +0200 -Subject: sched/smp: Use lag to simplify cross-runqueue placement - -Using lag is both more correct and simpler when moving between -runqueues. - -Notable, min_vruntime() was invented as a cheap approximation of -avg_vruntime() for this very purpose (SMP migration). Since we now -have the real thing; use it. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124604.068911180@infradead.org ---- - kernel/sched/fair.c | 145 +++++++--------------------------------------------- - 1 file changed, 19 insertions(+), 126 deletions(-) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 58798dae11b60..57e8bc14b06ee 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -5083,7 +5083,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - * - * EEVDF: placement strategy #1 / #2 - */ -- if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) { -+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { - struct sched_entity *curr = cfs_rq->curr; - unsigned long load; - -@@ -5172,60 +5172,20 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); @@ -1599,7 +1492,7 @@ index 58798dae11b60..57e8bc14b06ee 100644 - if (renorm && curr) - se->vruntime += cfs_rq->min_vruntime; + if (curr) -+ place_entity(cfs_rq, se, 0); ++ place_entity(cfs_rq, se, flags); update_curr(cfs_rq); @@ -1615,7 +1508,7 @@ index 58798dae11b60..57e8bc14b06ee 100644 /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. -@@ -5237,11 +5197,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4855,18 +5053,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); se_update_runnable(se); @@ -1625,473 +1518,21 @@ index 58798dae11b60..57e8bc14b06ee 100644 + * undo/redo all that. Seems wasteful. + */ update_cfs_group(se); -- account_entity_enqueue(cfs_rq, se); - -- if (flags & ENQUEUE_WAKEUP) ++ + /* + * XXX now that the entity has been re-weighted, and it's lag adjusted, + * we can place the entity. + */ + if (!curr) - place_entity(cfs_rq, se, 0); -+ -+ account_entity_enqueue(cfs_rq, se); ++ place_entity(cfs_rq, se, flags); + + account_entity_enqueue(cfs_rq, se); + +- if (flags & ENQUEUE_WAKEUP) +- place_entity(cfs_rq, se, 0); /* Entity has migrated, no longer consider this task hot */ if (flags & ENQUEUE_MIGRATED) se->exec_start = 0; -@@ -5346,23 +5317,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - - clear_buddies(cfs_rq, se); - -- if (flags & DEQUEUE_SLEEP) -- update_entity_lag(cfs_rq, se); -- -+ update_entity_lag(cfs_rq, se); - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); - se->on_rq = 0; - account_entity_dequeue(cfs_rq, se); - -- /* -- * Normalize after update_curr(); which will also have moved -- * min_vruntime if @se is the one holding it back. But before doing -- * update_min_vruntime() again, which will discount @se's position and -- * can move min_vruntime forward still more. -- */ -- if (!(flags & DEQUEUE_SLEEP)) -- se->vruntime -= cfs_rq->min_vruntime; -- - /* return excess runtime on last dequeue */ - return_cfs_rq_runtime(cfs_rq); - -@@ -8208,18 +8168,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) - { - struct sched_entity *se = &p->se; - -- /* -- * As blocked tasks retain absolute vruntime the migration needs to -- * deal with this by subtracting the old and adding the new -- * min_vruntime -- the latter is done by enqueue_entity() when placing -- * the task on the new runqueue. -- */ -- if (READ_ONCE(p->__state) == TASK_WAKING) { -- struct cfs_rq *cfs_rq = cfs_rq_of(se); -- -- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); -- } -- - if (!task_on_rq_migrating(p)) { - remove_entity_load_avg(se); - -@@ -12709,8 +12657,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) - */ - static void task_fork_fair(struct task_struct *p) - { -- struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, *curr; -+ struct cfs_rq *cfs_rq; - struct rq *rq = this_rq(); - struct rq_flags rf; - -@@ -12719,22 +12667,9 @@ static void task_fork_fair(struct task_struct *p) - - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; -- if (curr) { -+ if (curr) - update_curr(cfs_rq); -- se->vruntime = curr->vruntime; -- } - place_entity(cfs_rq, se, 1); -- -- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { -- /* -- * Upon rescheduling, sched_class::put_prev_task() will place -- * 'current' within the tree based on its new key value. -- */ -- swap(curr->vruntime, se->vruntime); -- resched_curr(rq); -- } -- -- se->vruntime -= cfs_rq->min_vruntime; - rq_unlock(rq, &rf); - } - -@@ -12763,34 +12698,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) - check_preempt_curr(rq, p, 0); - } - --static inline bool vruntime_normalized(struct task_struct *p) --{ -- struct sched_entity *se = &p->se; -- -- /* -- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, -- * the dequeue_entity(.flags=0) will already have normalized the -- * vruntime. -- */ -- if (p->on_rq) -- return true; -- -- /* -- * When !on_rq, vruntime of the task has usually NOT been normalized. -- * But there are some cases where it has already been normalized: -- * -- * - A forked child which is waiting for being woken up by -- * wake_up_new_task(). -- * - A task which has been woken up by try_to_wake_up() and -- * waiting for actually being woken up by sched_ttwu_pending(). -- */ -- if (!se->sum_exec_runtime || -- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) -- return true; -- -- return false; --} -- - #ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Propagate the changes of the sched_entity across the tg tree to make it -@@ -12861,16 +12768,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) - static void detach_task_cfs_rq(struct task_struct *p) - { - struct sched_entity *se = &p->se; -- struct cfs_rq *cfs_rq = cfs_rq_of(se); -- -- if (!vruntime_normalized(p)) { -- /* -- * Fix up our vruntime so that the current sleep doesn't -- * cause 'unlimited' sleep bonus. -- */ -- place_entity(cfs_rq, se, 0); -- se->vruntime -= cfs_rq->min_vruntime; -- } - - detach_entity_cfs_rq(se); - } -@@ -12878,12 +12775,8 @@ static void detach_task_cfs_rq(struct task_struct *p) - static void attach_task_cfs_rq(struct task_struct *p) - { - struct sched_entity *se = &p->se; -- struct cfs_rq *cfs_rq = cfs_rq_of(se); - - attach_entity_cfs_rq(se); -- -- if (!vruntime_normalized(p)) -- se->vruntime += cfs_rq->min_vruntime; - } - - static void switched_from_fair(struct rq *rq, struct task_struct *p) --- -cgit - -From 5e963f2bd4654a202a8a05aa3a86cb0300b10e6c Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:47 +0200 -Subject: sched/fair: Commit to EEVDF - -EEVDF is a better defined scheduling policy, as a result it has less -heuristics/tunables. There is no compelling reason to keep CFS around. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124604.137187212@infradead.org ---- - kernel/sched/debug.c | 6 - - kernel/sched/fair.c | 465 ++++-------------------------------------------- - kernel/sched/features.h | 12 -- - kernel/sched/sched.h | 5 - - 4 files changed, 38 insertions(+), 450 deletions(-) - -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 18efc6d0cc5ab..f8d190c7c8c0d 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -347,10 +347,7 @@ static __init int sched_init_debug(void) - debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); - #endif - -- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); - debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); -- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); -- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); - - debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); - debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -866,10 +863,7 @@ static void sched_debug_header(struct seq_file *m) - SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) - #define PN(x) \ - SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) -- PN(sysctl_sched_latency); - PN(sysctl_sched_min_granularity); -- PN(sysctl_sched_idle_min_granularity); -- PN(sysctl_sched_wakeup_granularity); - P(sysctl_sched_child_runs_first); - P(sysctl_sched_features); - #undef PN -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 57e8bc14b06ee..0605eb45c58aa 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -57,22 +57,6 @@ - #include "stats.h" - #include "autogroup.h" - --/* -- * Targeted preemption latency for CPU-bound tasks: -- * -- * NOTE: this latency value is not the same as the concept of -- * 'timeslice length' - timeslices in CFS are of variable length -- * and have no persistent notion like in traditional, time-slice -- * based scheduling concepts. -- * -- * (to see the precise effective timeslice length of your workload, -- * run vmstat and monitor the context-switches (cs) field) -- * -- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) -- */ --unsigned int sysctl_sched_latency = 6000000ULL; --static unsigned int normalized_sysctl_sched_latency = 6000000ULL; -- - /* - * The initial- and re-scaling of tunables is configurable - * -@@ -94,37 +78,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; - unsigned int sysctl_sched_min_granularity = 750000ULL; - static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; - --/* -- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. -- * Applies only when SCHED_IDLE tasks compete with normal tasks. -- * -- * (default: 0.75 msec) -- */ --unsigned int sysctl_sched_idle_min_granularity = 750000ULL; -- --/* -- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity -- */ --static unsigned int sched_nr_latency = 8; -- - /* - * After fork, child runs first. If set to 0 (default) then - * parent will (try to) run first. - */ - unsigned int sysctl_sched_child_runs_first __read_mostly; - --/* -- * SCHED_OTHER wake-up granularity. -- * -- * This option delays the preemption effects of decoupled workloads -- * and reduces their over-scheduling. Synchronous workloads will still -- * have immediate wakeup/sleep latencies. -- * -- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) -- */ --unsigned int sysctl_sched_wakeup_granularity = 1000000UL; --static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; -- - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; - - int sched_thermal_decay_shift; -@@ -279,8 +238,6 @@ static void update_sysctl(void) - #define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); -- SET_SYSCTL(sched_latency); -- SET_SYSCTL(sched_wakeup_granularity); - #undef SET_SYSCTL - } - -@@ -888,30 +845,6 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) - return __node_2_se(left); - } - --static struct sched_entity *__pick_next_entity(struct sched_entity *se) --{ -- struct rb_node *next = rb_next(&se->run_node); -- -- if (!next) -- return NULL; -- -- return __node_2_se(next); --} -- --static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr) --{ -- struct sched_entity *left = __pick_first_entity(cfs_rq); -- -- /* -- * If curr is set we have to see if its left of the leftmost entity -- * still in the tree, provided there was anything in the tree at all. -- */ -- if (!left || (curr && entity_before(curr, left))) -- left = curr; -- -- return left; --} -- - /* - * Earliest Eligible Virtual Deadline First - * -@@ -1008,85 +941,15 @@ int sched_update_scaling(void) - { - unsigned int factor = get_update_sysctl_factor(); - -- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, -- sysctl_sched_min_granularity); -- - #define WRT_SYSCTL(name) \ - (normalized_sysctl_##name = sysctl_##name / (factor)) - WRT_SYSCTL(sched_min_granularity); -- WRT_SYSCTL(sched_latency); -- WRT_SYSCTL(sched_wakeup_granularity); - #undef WRT_SYSCTL - - return 0; - } - #endif - --/* -- * The idea is to set a period in which each task runs once. -- * -- * When there are too many tasks (sched_nr_latency) we have to stretch -- * this period because otherwise the slices get too small. -- * -- * p = (nr <= nl) ? l : l*nr/nl -- */ --static u64 __sched_period(unsigned long nr_running) --{ -- if (unlikely(nr_running > sched_nr_latency)) -- return nr_running * sysctl_sched_min_granularity; -- else -- return sysctl_sched_latency; --} -- --static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); -- --/* -- * We calculate the wall-time slice from the period by taking a part -- * proportional to the weight. -- * -- * s = p*P[w/rw] -- */ --static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) --{ -- unsigned int nr_running = cfs_rq->nr_running; -- struct sched_entity *init_se = se; -- unsigned int min_gran; -- u64 slice; -- -- if (sched_feat(ALT_PERIOD)) -- nr_running = rq_of(cfs_rq)->cfs.h_nr_running; -- -- slice = __sched_period(nr_running + !se->on_rq); -- -- for_each_sched_entity(se) { -- struct load_weight *load; -- struct load_weight lw; -- struct cfs_rq *qcfs_rq; -- -- qcfs_rq = cfs_rq_of(se); -- load = &qcfs_rq->load; -- -- if (unlikely(!se->on_rq)) { -- lw = qcfs_rq->load; -- -- update_load_add(&lw, se->load.weight); -- load = &lw; -- } -- slice = __calc_delta(slice, se->load.weight, load); -- } -- -- if (sched_feat(BASE_SLICE)) { -- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) -- min_gran = sysctl_sched_idle_min_granularity; -- else -- min_gran = sysctl_sched_min_granularity; -- -- slice = max_t(u64, slice, min_gran); -- } -- -- return slice; --} -- - static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); - - /* -@@ -1098,35 +961,25 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) - if ((s64)(se->vruntime - se->deadline) < 0) - return; - -- if (sched_feat(EEVDF)) { -- /* -- * For EEVDF the virtual time slope is determined by w_i (iow. -- * nice) while the request time r_i is determined by -- * sysctl_sched_min_granularity. -- */ -- se->slice = sysctl_sched_min_granularity; -- -- /* -- * The task has consumed its request, reschedule. -- */ -- if (cfs_rq->nr_running > 1) { -- resched_curr(rq_of(cfs_rq)); -- clear_buddies(cfs_rq, se); -- } -- } else { -- /* -- * When many tasks blow up the sched_period; it is possible -- * that sched_slice() reports unusually large results (when -- * many tasks are very light for example). Therefore impose a -- * maximum. -- */ -- se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency); -- } -+ /* -+ * For EEVDF the virtual time slope is determined by w_i (iow. -+ * nice) while the request time r_i is determined by -+ * sysctl_sched_min_granularity. -+ */ -+ se->slice = sysctl_sched_min_granularity; - - /* - * EEVDF: vd_i = ve_i + r_i / w_i - */ - se->deadline = se->vruntime + calc_delta_fair(se->slice, se); -+ -+ /* -+ * The task has consumed its request, reschedule. -+ */ -+ if (cfs_rq->nr_running > 1) { -+ resched_curr(rq_of(cfs_rq)); -+ clear_buddies(cfs_rq, se); -+ } - } - - #include "pelt.h" -@@ -5055,19 +4908,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - - #endif /* CONFIG_SMP */ - --static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) --{ --#ifdef CONFIG_SCHED_DEBUG -- s64 d = se->vruntime - cfs_rq->min_vruntime; -- -- if (d < 0) -- d = -d; -- -- if (d > 3*sysctl_sched_latency) -- schedstat_inc(cfs_rq->nr_spread_over); --#endif --} -- - static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - { -@@ -5219,7 +5059,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_schedstat_required(); update_stats_enqueue_fair(cfs_rq, se, flags); @@ -2099,7 +1540,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; -@@ -5241,17 +5080,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4878,17 +5086,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } @@ -2117,7 +1558,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { -@@ -5263,27 +5091,10 @@ static void __clear_buddies_next(struct sched_entity *se) +@@ -4900,27 +5097,10 @@ static void __clear_buddies_next(struct sched_entity *se) } } @@ -2145,7 +1586,29 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 } static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -5341,45 +5152,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4954,20 +5134,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + clear_buddies(cfs_rq, se); + ++ update_entity_lag(cfs_rq, se); + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + account_entity_dequeue(cfs_rq, se); + +- /* +- * Normalize after update_curr(); which will also have moved +- * min_vruntime if @se is the one holding it back. But before doing +- * update_min_vruntime() again, which will discount @se's position and +- * can move min_vruntime forward still more. +- */ +- if (!(flags & DEQUEUE_SLEEP)) +- se->vruntime -= cfs_rq->min_vruntime; +- + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + +@@ -4986,52 +5158,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_idle_cfs_rq_clock_pelt(cfs_rq); } @@ -2155,12 +1618,19 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ -- unsigned long delta_exec; +- unsigned long ideal_runtime, delta_exec; - struct sched_entity *se; - s64 delta; - +- /* +- * When many tasks blow up the sched_period; it is possible that +- * sched_slice() reports unusually large results (when many tasks are +- * very light for example). Therefore impose a maximum. +- */ +- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); +- - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; -- if (delta_exec > curr->slice) { +- if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); - /* - * The current task ran long enough, ensure it doesn't get @@ -2184,14 +1654,26 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 - if (delta < 0) - return; - -- if (delta > curr->slice) +- if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); -} - static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { -@@ -5418,9 +5190,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5047,6 +5173,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + update_stats_wait_end_fair(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + update_load_avg(cfs_rq, se, UPDATE_TG); ++ /* ++ * HACK, stash a copy of deadline at the point of pick in vlag, ++ * which isn't used until dequeue. ++ */ ++ se->vlag = se->deadline; + } + + update_stats_curr_start(cfs_rq, se); +@@ -5070,9 +5201,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -2201,24 +1683,21 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups -@@ -5431,53 +5200,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +@@ -5083,50 +5211,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { -- struct sched_entity *left, *se; +- struct sched_entity *left = __pick_first_entity(cfs_rq); +- struct sched_entity *se; - -- if (sched_feat(EEVDF)) { -- /* -- * Enabling NEXT_BUDDY will affect latency but not fairness. -- */ -- if (sched_feat(NEXT_BUDDY) && -- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -- return cfs_rq->next; +- /* +- * If curr is set we have to see if its left of the leftmost entity +- * still in the tree, provided there was anything in the tree at all. +- */ +- if (!left || (curr && entity_before(curr, left))) +- left = curr; - -- return pick_eevdf(cfs_rq); -- } -- -- se = left = pick_cfs(cfs_rq, curr); +- se = left; /* ideally we run the leftmost entity */ - /* - * Avoid running the skip buddy, if running something else can @@ -2227,10 +1706,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 */ - if (cfs_rq->skip && cfs_rq->skip == se) { - struct sched_entity *second; -+ if (sched_feat(NEXT_BUDDY) && -+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -+ return cfs_rq->next; - +- - if (se == curr) { - second = __pick_first_entity(cfs_rq); - } else { @@ -2242,7 +1718,10 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; - } -- ++ if (sched_feat(NEXT_BUDDY) && ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ return cfs_rq->next; + - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { - /* - * Someone really wants this to run. If it's not unfair, run it. @@ -2260,7 +1739,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -5494,8 +5224,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) +@@ -5143,8 +5235,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); @@ -2269,17 +1748,32 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 if (prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ -@@ -5536,9 +5264,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -5185,9 +5275,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) return; #endif - -- if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1) +- if (cfs_rq->nr_running > 1) - check_preempt_tick(cfs_rq, curr); } -@@ -6610,8 +6335,7 @@ static void hrtick_update(struct rq *rq) +@@ -6210,13 +6297,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} + static void hrtick_start_fair(struct rq *rq, struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); + + SCHED_WARN_ON(task_rq(p) != rq); + + if (rq->cfs.h_nr_running > 1) { +- u64 slice = sched_slice(cfs_rq, se); + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ u64 slice = se->slice; + s64 delta = slice - ran; + + if (delta < 0) { +@@ -6240,8 +6326,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; @@ -2289,7 +1783,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 } #else /* !CONFIG_SCHED_HRTICK */ static inline void -@@ -6652,17 +6376,6 @@ static int sched_idle_rq(struct rq *rq) +@@ -6282,17 +6367,6 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } @@ -2307,7 +1801,26 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { -@@ -8205,66 +7918,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +@@ -7795,18 +7869,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + { + struct sched_entity *se = &p->se; + +- /* +- * As blocked tasks retain absolute vruntime the migration needs to +- * deal with this by subtracting the old and adding the new +- * min_vruntime -- the latter is done by enqueue_entity() when placing +- * the task on the new runqueue. +- */ +- if (READ_ONCE(p->__state) == TASK_WAKING) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- +- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); +- } +- + if (!task_on_rq_migrating(p)) { + remove_entity_load_avg(se); + +@@ -7844,66 +7906,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ @@ -2374,7 +1887,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { -@@ -8276,12 +7929,6 @@ static void set_next_buddy(struct sched_entity *se) +@@ -7915,12 +7917,6 @@ static void set_next_buddy(struct sched_entity *se) } } @@ -2387,7 +1900,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 /* * Preempt the current task with a newly woken task if needed: */ -@@ -8290,7 +7937,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7929,7 +7925,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -2395,7 +1908,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; -@@ -8306,7 +7952,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7945,7 +7940,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; @@ -2404,20 +1917,11 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 set_next_buddy(pse); next_buddy_marked = 1; } -@@ -8354,44 +8000,16 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - cfs_rq = cfs_rq_of(se); - update_curr(cfs_rq); +@@ -7990,35 +7985,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + if (cse_is_idle != pse_is_idle) + return; -- if (sched_feat(EEVDF)) { -- /* -- * XXX pick_eevdf(cfs_rq) != se ? -- */ -- if (pick_eevdf(cfs_rq) == pse) -- goto preempt; -- -- return; -- } -- +- update_curr(cfs_rq_of(se)); - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is @@ -2425,6 +1929,9 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 - */ - if (!next_buddy_marked) - set_next_buddy(pse); ++ cfs_rq = cfs_rq_of(se); ++ update_curr(cfs_rq); ++ + /* + * XXX pick_eevdf(cfs_rq) != se ? + */ @@ -2453,7 +1960,22 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 } #ifdef CONFIG_SMP -@@ -8592,8 +8210,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +@@ -8203,6 +8182,14 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) + return pick_next_task_fair(rq, NULL, NULL); + } + ++static bool eligible_task_fair(struct rq *rq, struct task_struct *p) ++{ ++ struct sched_entity *se = &p->se; ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ ++ return entity_eligible(cfs_rq, se); ++} ++ + /* + * Account for a descheduled task: + */ +@@ -8219,8 +8206,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) /* * sched_yield() is very simple @@ -2462,11 +1984,11 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 */ static void yield_task_fair(struct rq *rq) { -@@ -8609,23 +8225,19 @@ static void yield_task_fair(struct rq *rq) +@@ -8236,21 +8221,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); -- if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) { +- if (curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. @@ -2479,8 +2001,6 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 - */ - rq_clock_skip_update(rq); - } -- if (sched_feat(EEVDF)) -- se->deadline += calc_delta_fair(se->slice, se); + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. @@ -2498,7 +2018,7 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) -@@ -8873,8 +8485,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) +@@ -8493,8 +8476,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) * Buddy candidates are cache hot: */ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && @@ -2508,858 +2028,117 @@ index 57e8bc14b06ee..0605eb45c58aa 100644 return 1; if (sysctl_sched_migration_cost == -1) -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 2a830eccda3e9..54334ca5c5c61 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -14,13 +14,6 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) - */ - SCHED_FEAT(NEXT_BUDDY, false) - --/* -- * Prefer to schedule the task that ran last (when we did -- * wake-preempt) as that likely will touch the same data, increases -- * cache locality. -- */ --SCHED_FEAT(LAST_BUDDY, true) -- - /* - * Consider buddies to be cache hot, decreases the likeliness of a - * cache buddy being migrated away, increases cache locality. -@@ -93,8 +86,3 @@ SCHED_FEAT(UTIL_EST, true) - SCHED_FEAT(UTIL_EST_FASTUP, true) - - SCHED_FEAT(LATENCY_WARN, false) -- --SCHED_FEAT(ALT_PERIOD, true) --SCHED_FEAT(BASE_SLICE, true) -- --SCHED_FEAT(EEVDF, true) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index aa5b293ca4ed3..f814bb731235d 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -570,8 +570,6 @@ struct cfs_rq { - */ - struct sched_entity *curr; - struct sched_entity *next; -- struct sched_entity *last; -- struct sched_entity *skip; - - #ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -@@ -2508,9 +2506,6 @@ extern const_debug unsigned int sysctl_sched_migration_cost; - extern unsigned int sysctl_sched_min_granularity; - - #ifdef CONFIG_SCHED_DEBUG --extern unsigned int sysctl_sched_latency; --extern unsigned int sysctl_sched_idle_min_granularity; --extern unsigned int sysctl_sched_wakeup_granularity; - extern int sysctl_resched_latency_warn_ms; - extern int sysctl_resched_latency_warn_once; - --- -cgit - -From e4ec3318a17f5dcf11bc23b2d2c1da4c1c5bb507 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:48 +0200 -Subject: sched/debug: Rename sysctl_sched_min_granularity to - sysctl_sched_base_slice - -EEVDF uses this tunable as the base request/slice -- make sure the -name reflects this. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124604.205287511@infradead.org ---- - kernel/sched/core.c | 2 +- - kernel/sched/debug.c | 4 ++-- - kernel/sched/fair.c | 12 ++++++------ - kernel/sched/sched.h | 2 +- - 4 files changed, 10 insertions(+), 10 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index e85a2fd258e2b..a5d3422f7d0de 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4502,7 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->se.nr_migrations = 0; - p->se.vruntime = 0; - p->se.vlag = 0; -- p->se.slice = sysctl_sched_min_granularity; -+ p->se.slice = sysctl_sched_base_slice; - INIT_LIST_HEAD(&p->se.group_node); - - #ifdef CONFIG_FAIR_GROUP_SCHED -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index f8d190c7c8c0d..4c3d0d9f3db63 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -347,7 +347,7 @@ static __init int sched_init_debug(void) - debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); - #endif - -- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); -+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); - - debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); - debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -863,7 +863,7 @@ static void sched_debug_header(struct seq_file *m) - SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) - #define PN(x) \ - SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) -- PN(sysctl_sched_min_granularity); -+ PN(sysctl_sched_base_slice); - P(sysctl_sched_child_runs_first); - P(sysctl_sched_features); - #undef PN -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 0605eb45c58aa..61747a25d06db 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -75,8 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; - * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ --unsigned int sysctl_sched_min_granularity = 750000ULL; --static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; -+unsigned int sysctl_sched_base_slice = 750000ULL; -+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; - - /* - * After fork, child runs first. If set to 0 (default) then -@@ -237,7 +237,7 @@ static void update_sysctl(void) - - #define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) -- SET_SYSCTL(sched_min_granularity); -+ SET_SYSCTL(sched_base_slice); - #undef SET_SYSCTL - } - -@@ -943,7 +943,7 @@ int sched_update_scaling(void) - - #define WRT_SYSCTL(name) \ - (normalized_sysctl_##name = sysctl_##name / (factor)) -- WRT_SYSCTL(sched_min_granularity); -+ WRT_SYSCTL(sched_base_slice); - #undef WRT_SYSCTL - - return 0; -@@ -964,9 +964,9 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) - /* - * For EEVDF the virtual time slope is determined by w_i (iow. - * nice) while the request time r_i is determined by -- * sysctl_sched_min_granularity. -+ * sysctl_sched_base_slice. - */ -- se->slice = sysctl_sched_min_granularity; -+ se->slice = sysctl_sched_base_slice; - - /* - * EEVDF: vd_i = ve_i + r_i / w_i -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index f814bb731235d..7ff9965570e69 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2503,7 +2503,7 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); - extern const_debug unsigned int sysctl_sched_nr_migrate; - extern const_debug unsigned int sysctl_sched_migration_cost; - --extern unsigned int sysctl_sched_min_granularity; -+extern unsigned int sysctl_sched_base_slice; - - #ifdef CONFIG_SCHED_DEBUG - extern int sysctl_resched_latency_warn_ms; --- -cgit - -From d07f09a1f99cabbc86bc5c97d962eb8a466106b5 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 31 May 2023 13:58:49 +0200 -Subject: sched/fair: Propagate enqueue flags into place_entity() - -This allows place_entity() to consider ENQUEUE_WAKEUP and -ENQUEUE_MIGRATED. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Ingo Molnar -Link: https://lore.kernel.org/r/20230531124604.274010996@infradead.org ---- - kernel/sched/fair.c | 10 +++++----- - kernel/sched/sched.h | 1 + - 2 files changed, 6 insertions(+), 5 deletions(-) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 61747a25d06db..5c8c9f7d8496a 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -4909,7 +4909,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - #endif /* CONFIG_SMP */ - - static void --place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -12004,8 +11986,8 @@ static void rq_offline_fair(struct rq *rq) + static inline bool + __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { - u64 vslice = calc_delta_fair(se->slice, se); - u64 vruntime = avg_vruntime(cfs_rq); -@@ -4998,7 +4998,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - * on average, halfway through their slice, as such start tasks - * off with half a slice to ease into the competition. - */ -- if (sched_feat(PLACE_DEADLINE_INITIAL) && initial) -+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) - vslice /= 2; +- u64 slice = sched_slice(cfs_rq_of(se), se); + u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ u64 slice = se->slice; - /* -@@ -5022,7 +5022,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - * update_curr(). - */ - if (curr) -- place_entity(cfs_rq, se, 0); -+ place_entity(cfs_rq, se, flags); + return (rtime * min_nr_tasks > slice); + } +@@ -12161,8 +12143,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + */ + static void task_fork_fair(struct task_struct *p) + { +- struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se, *curr; ++ struct cfs_rq *cfs_rq; + struct rq *rq = this_rq(); + struct rq_flags rf; - update_curr(cfs_rq); +@@ -12171,22 +12153,9 @@ static void task_fork_fair(struct task_struct *p) -@@ -5049,7 +5049,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - * we can place the entity. - */ - if (!curr) -- place_entity(cfs_rq, se, 0); -+ place_entity(cfs_rq, se, flags); - - account_entity_enqueue(cfs_rq, se); - -@@ -12280,7 +12280,7 @@ static void task_fork_fair(struct task_struct *p) + cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (curr) +- if (curr) { ++ if (curr) update_curr(cfs_rq); +- se->vruntime = curr->vruntime; +- } - place_entity(cfs_rq, se, 1); +- +- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { +- /* +- * Upon rescheduling, sched_class::put_prev_task() will place +- * 'current' within the tree based on its new key value. +- */ +- swap(curr->vruntime, se->vruntime); +- resched_curr(rq); +- } +- +- se->vruntime -= cfs_rq->min_vruntime; + place_entity(cfs_rq, se, ENQUEUE_INITIAL); rq_unlock(rq, &rf); } -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 7ff9965570e69..db5853761b1f3 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2199,6 +2199,7 @@ extern const u32 sched_prio_to_wmult[40]; - #else - #define ENQUEUE_MIGRATED 0x00 - #endif -+#define ENQUEUE_INITIAL 0x80 - - #define RETRY_TASK ((void *)-1UL) - --- -cgit - -From 246c6d7ab4d042b185d7df71f437137d43cbb83a Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Sat, 25 Mar 2023 00:14:04 +0100 -Subject: sched/eevdf: Better handle mixed slice length - -In the case where (due to latency-nice) there are different request -sizes in the tree, the smaller requests tend to be dominated by the -larger. Also note how the EEVDF lag limits are based on r_max. - -Therefore; add a heuristic that for the mixed request size case, moves -smaller requests to placement strategy #2 which ensures they're -immidiately eligible and and due to their smaller (virtual) deadline -will cause preemption. - -NOTE: this relies on update_entity_lag() to impose lag limits above -a single slice. - -Signed-off-by: Peter Zijlstra (Intel) ---- - kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++++ - kernel/sched/features.h | 1 + - kernel/sched/sched.h | 1 + - 3 files changed, 41 insertions(+) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 5c8c9f7d8496a..16949f7bbb172 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) - s64 key = entity_key(cfs_rq, se); - - cfs_rq->avg_vruntime += key * weight; -+ cfs_rq->avg_slice += se->slice * weight; - cfs_rq->avg_load += weight; +@@ -12215,34 +12184,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) + check_preempt_curr(rq, p, 0); } -@@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) - s64 key = entity_key(cfs_rq, se); - - cfs_rq->avg_vruntime -= key * weight; -+ cfs_rq->avg_slice -= se->slice * weight; - cfs_rq->avg_load -= weight; - } - -@@ -4908,6 +4910,30 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - - #endif /* CONFIG_SMP */ - -+static inline bool -+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags) -+{ -+ u64 now, vdelta; -+ s64 delta; -+ -+ if (!(flags & ENQUEUE_WAKEUP)) -+ return false; -+ -+ if (flags & ENQUEUE_MIGRATED) -+ return true; -+ -+ now = rq_clock_task(rq_of(cfs_rq)); -+ delta = now - se->exec_start; -+ if (delta < 0) -+ return false; -+ -+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load); -+ if (vdelta < vslice) -+ return false; -+ -+ return true; -+} -+ - static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - { -@@ -4929,6 +4955,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - - lag = se->vlag; - -+ /* -+ * For latency sensitive tasks; those that have a shorter than -+ * average slice and do not fully consume the slice, transition -+ * to EEVDF placement strategy #2. -+ */ -+ if (sched_feat(PLACE_FUDGE) && -+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) && -+ entity_has_slept(cfs_rq, se, vslice, flags)) { -+ lag += vslice; -+ if (lag > 0) -+ lag = 0; -+ } -+ - /* - * If we want to place a task and preserve lag, we have to - * consider the effect of the new entity on the weighted -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 54334ca5c5c61..7d65b40299d91 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -5,6 +5,7 @@ - * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. - */ - SCHED_FEAT(PLACE_LAG, true) -+SCHED_FEAT(PLACE_FUDGE, true) - SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) - - /* -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index db5853761b1f3..bc45beee335c5 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -549,6 +549,7 @@ struct cfs_rq { - unsigned int idle_h_nr_running; /* SCHED_IDLE */ - - s64 avg_vruntime; -+ u64 avg_slice; - u64 avg_load; - - u64 exec_clock; --- -cgit - -From 36b9081885fee5764b53970dd2d6afe8c2f13b7f Mon Sep 17 00:00:00 2001 -From: Parth Shah -Date: Sat, 11 Mar 2023 12:20:21 +0100 -Subject: sched: Introduce latency-nice as a per-task attribute - -Latency-nice indicates the latency requirements of a task with respect -to the other tasks in the system. The value of the attribute can be within -the range of [-20, 19] both inclusive to be in-line with the values just -like task nice values. - -Just like task nice, -20 is the 'highest' priority and conveys this -task should get minimal latency, conversely 19 is the lowest priority -and conveys this task will get the least consideration and will thus -receive maximal latency. - -[peterz: rebase, squash] -Signed-off-by: Parth Shah -Signed-off-by: Peter Zijlstra (Intel) ---- - include/linux/sched.h | 1 + - include/uapi/linux/sched.h | 4 +++- - include/uapi/linux/sched/types.h | 19 +++++++++++++++++++ - init/init_task.c | 3 ++- - kernel/sched/core.c | 27 ++++++++++++++++++++++++++- - kernel/sched/debug.c | 1 + - tools/include/uapi/linux/sched.h | 4 +++- - 7 files changed, 55 insertions(+), 4 deletions(-) - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 177b3f3676ef8..80bb40a63e9aa 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -790,6 +790,7 @@ struct task_struct { - int static_prio; - int normal_prio; - unsigned int rt_priority; -+ int latency_prio; - - struct sched_entity se; - struct sched_rt_entity rt; -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab26..b2e932c25be62 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -132,6 +132,7 @@ struct clone_args { - #define SCHED_FLAG_KEEP_PARAMS 0x10 - #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 - #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 -+#define SCHED_FLAG_LATENCY_NICE 0x80 - - #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ - SCHED_FLAG_KEEP_PARAMS) -@@ -143,6 +144,7 @@ struct clone_args { - SCHED_FLAG_RECLAIM | \ - SCHED_FLAG_DL_OVERRUN | \ - SCHED_FLAG_KEEP_ALL | \ -- SCHED_FLAG_UTIL_CLAMP) -+ SCHED_FLAG_UTIL_CLAMP | \ -+ SCHED_FLAG_LATENCY_NICE) - - #endif /* _UAPI_LINUX_SCHED_H */ -diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h -index f2c4589d4dbfe..db1e8199e8c80 100644 ---- a/include/uapi/linux/sched/types.h -+++ b/include/uapi/linux/sched/types.h -@@ -10,6 +10,7 @@ struct sched_param { - - #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ - #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ -+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */ - - /* - * Extended scheduling parameters data structure. -@@ -98,6 +99,22 @@ struct sched_param { - * scheduled on a CPU with no more capacity than the specified value. - * - * A task utilization boundary can be reset by setting the attribute to -1. -+ * -+ * Latency Tolerance Attributes -+ * =========================== -+ * -+ * A subset of sched_attr attributes allows to specify the relative latency -+ * requirements of a task with respect to the other tasks running/queued in the -+ * system. -+ * -+ * @ sched_latency_nice task's latency_nice value -+ * -+ * The latency_nice of a task can have any value in a range of -+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE]. -+ * -+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be -+ * taken for a task requiring a lower latency as opposed to the task with -+ * higher latency_nice. - */ - struct sched_attr { - __u32 size; -@@ -120,6 +137,8 @@ struct sched_attr { - __u32 sched_util_min; - __u32 sched_util_max; - -+ /* latency requirement hints */ -+ __s32 sched_latency_nice; - }; - - #endif /* _UAPI_LINUX_SCHED_TYPES_H */ -diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bfe6b1..511cbcf3510dc 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -78,6 +78,7 @@ struct task_struct init_task - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+ .latency_prio = DEFAULT_PRIO, - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .user_cpus_ptr = NULL, -@@ -89,7 +90,7 @@ struct task_struct init_task - .fn = do_no_restart_syscall, - }, - .se = { -- .group_node = LIST_HEAD_INIT(init_task.se.group_node), -+ .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, - .rt = { - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index a5d3422f7d0de..b3533d0d4a2ca 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4757,6 +4757,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - p->prio = p->normal_prio = p->static_prio; - set_load_weight(p, false); - -+ p->latency_prio = NICE_TO_PRIO(0); -+ - /* - * We don't need the reset flag anymore after the fork. It has - * fulfilled its duty: -@@ -7531,7 +7533,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) - #define SETPARAM_POLICY -1 - - static void __setscheduler_params(struct task_struct *p, -- const struct sched_attr *attr) -+ const struct sched_attr *attr) - { - int policy = attr->sched_policy; - -@@ -7555,6 +7557,13 @@ static void __setscheduler_params(struct task_struct *p, - set_load_weight(p, true); - } - -+static void __setscheduler_latency(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) -+ p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice); -+} -+ - /* - * Check the target process has a UID that matches the current process's: - */ -@@ -7689,6 +7698,13 @@ recheck: - return retval; - } - -+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { -+ if (attr->sched_latency_nice > MAX_NICE) -+ return -EINVAL; -+ if (attr->sched_latency_nice < MIN_NICE) -+ return -EINVAL; -+ } -+ - /* Update task specific "requested" clamps */ - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { - retval = uclamp_validate(p, attr); -@@ -7736,6 +7752,9 @@ recheck: - goto change; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) - goto change; -+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && -+ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio)) -+ goto change; - - p->sched_reset_on_fork = reset_on_fork; - retval = 0; -@@ -7824,6 +7843,7 @@ change: - __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); - } -+ __setscheduler_latency(p, attr); - __setscheduler_uclamp(p, attr); - - if (queued) { -@@ -8035,6 +8055,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a - size < SCHED_ATTR_SIZE_VER1) - return -EINVAL; - -+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) && -+ size < SCHED_ATTR_SIZE_VER2) -+ return -EINVAL; - /* - * XXX: Do we want to be lenient like existing syscalls; or do we want - * to be strict and return an error on out-of-bounds values? -@@ -8272,6 +8295,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - get_params(p, &kattr); - kattr.sched_flags &= SCHED_FLAG_ALL; - -+ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio); -+ - #ifdef CONFIG_UCLAMP_TASK - /* - * This could race with another potential updater, but this is fine -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 4c3d0d9f3db63..5c743bcb340d2 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -1086,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, - #endif - P(policy); - P(prio); -+ P(latency_prio); - if (task_has_dl_policy(p)) { - P(dl.runtime); - P(dl.deadline); -diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h -index 3bac0a8ceab26..b2e932c25be62 100644 ---- a/tools/include/uapi/linux/sched.h -+++ b/tools/include/uapi/linux/sched.h -@@ -132,6 +132,7 @@ struct clone_args { - #define SCHED_FLAG_KEEP_PARAMS 0x10 - #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 - #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 -+#define SCHED_FLAG_LATENCY_NICE 0x80 - - #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ - SCHED_FLAG_KEEP_PARAMS) -@@ -143,6 +144,7 @@ struct clone_args { - SCHED_FLAG_RECLAIM | \ - SCHED_FLAG_DL_OVERRUN | \ - SCHED_FLAG_KEEP_ALL | \ -- SCHED_FLAG_UTIL_CLAMP) -+ SCHED_FLAG_UTIL_CLAMP | \ -+ SCHED_FLAG_LATENCY_NICE) - - #endif /* _UAPI_LINUX_SCHED_H */ --- -cgit - -From 9f9a3323112d3aa5afa466b1e391e137f28dc79d Mon Sep 17 00:00:00 2001 -From: "Peter Zijlstra (Intel)" -Date: Fri, 24 Feb 2023 10:34:51 +0100 -Subject: sched/fair: Implement latency-nice - -Implement latency-nice as a modulation of the EEVDF r_i parameter, -specifically apply the inverse sched_prio_to_weight[] relation on -base_slice. - -Given a base slice of 3 [ms], this gives a range of: - - latency-nice 19: 3*1024 / 15 ~= 204.8 [ms] - latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms] - -(which might not make sense) - -Signed-off-by: Peter Zijlstra (Intel) -Tested-by: K Prateek Nayak ---- - kernel/sched/core.c | 14 ++++++++++---- - kernel/sched/fair.c | 22 +++++++++++++++------- - kernel/sched/sched.h | 2 ++ - 3 files changed, 27 insertions(+), 11 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index b3533d0d4a2ca..263caac8f76b7 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) - } - } - -+static inline void set_latency_prio(struct task_struct *p, int prio) -+{ -+ p->latency_prio = prio; -+ set_latency_fair(&p->se, prio - MAX_RT_PRIO); -+} -+ - #ifdef CONFIG_UCLAMP_TASK - /* - * Serializes updates of utilization clamp values -@@ -4502,9 +4508,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->se.nr_migrations = 0; - p->se.vruntime = 0; - p->se.vlag = 0; -- p->se.slice = sysctl_sched_base_slice; - INIT_LIST_HEAD(&p->se.group_node); - -+ set_latency_prio(p, p->latency_prio); -+ - #ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = NULL; - #endif -@@ -4756,8 +4763,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - - p->prio = p->normal_prio = p->static_prio; - set_load_weight(p, false); +-static inline bool vruntime_normalized(struct task_struct *p) +-{ +- struct sched_entity *se = &p->se; - -- p->latency_prio = NICE_TO_PRIO(0); -+ set_latency_prio(p, NICE_TO_PRIO(0)); - - /* - * We don't need the reset flag anymore after the fork. It has -@@ -7561,7 +7567,7 @@ static void __setscheduler_latency(struct task_struct *p, - const struct sched_attr *attr) - { - if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) -- p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice); -+ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice)); - } - - /* -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 16949f7bbb172..c2019e7d46cf5 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -952,6 +952,21 @@ int sched_update_scaling(void) - } - #endif - -+void set_latency_fair(struct sched_entity *se, int prio) -+{ -+ u32 weight = sched_prio_to_weight[prio]; -+ u64 base = sysctl_sched_base_slice; -+ -+ /* -+ * For EEVDF the virtual time slope is determined by w_i (iow. -+ * nice) while the request time r_i is determined by -+ * latency-nice. -+ * -+ * Smaller request gets better latency. -+ */ -+ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); -+} -+ - static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); - - /* -@@ -963,13 +978,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) - if ((s64)(se->vruntime - se->deadline) < 0) - return; - - /* -- * For EEVDF the virtual time slope is determined by w_i (iow. -- * nice) while the request time r_i is determined by -- * sysctl_sched_base_slice. +- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, +- * the dequeue_entity(.flags=0) will already have normalized the +- * vruntime. - */ -- se->slice = sysctl_sched_base_slice; +- if (p->on_rq) +- return true; - - /* - * EEVDF: vd_i = ve_i + r_i / w_i - */ -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index bc45beee335c5..8f8d903a01892 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2520,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; - extern unsigned int sysctl_numa_balancing_hot_threshold; - #endif - -+extern void set_latency_fair(struct sched_entity *se, int prio); -+ - #ifdef CONFIG_SCHED_HRTICK - +- /* +- * When !on_rq, vruntime of the task has usually NOT been normalized. +- * But there are some cases where it has already been normalized: +- * +- * - A forked child which is waiting for being woken up by +- * wake_up_new_task(). +- * - A task which has been woken up by try_to_wake_up() and +- * waiting for actually being woken up by sched_ttwu_pending(). +- */ +- if (!se->sum_exec_runtime || +- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) +- return true; +- +- return false; +-} +- + #ifdef CONFIG_FAIR_GROUP_SCHED /* --- -cgit - -From a317f35154852bc023a7ab2e3fa491e1897af72f Mon Sep 17 00:00:00 2001 -From: Vincent Guittot -Date: Fri, 24 Feb 2023 10:34:52 +0100 -Subject: sched/fair: Add sched group latency support - -Task can set its latency priority with sched_setattr(), which is then used -to set the latency offset of its sched_enity, but sched group entities -still have the default latency offset value. - -Add a latency.nice field in cpu cgroup controller to set the latency -priority of the group similarly to sched_setattr(). The latency priority -is then used to set the offset of the sched_entities of the group. - -Signed-off-by: Vincent Guittot -Signed-off-by: Peter Zijlstra (Intel) -Tested-by: K Prateek Nayak -Link: https://lkml.kernel.org/r/20230224093454.956298-7-vincent.guittot@linaro.org ---- - Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++ - kernel/sched/core.c | 30 ++++++++++++++++++++++++++++++ - kernel/sched/fair.c | 27 +++++++++++++++++++++++++++ - kernel/sched/sched.h | 4 ++++ - 4 files changed, 71 insertions(+) - -diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst -index 4ef8901911961..3a8d3e1e55910 100644 ---- a/Documentation/admin-guide/cgroup-v2.rst -+++ b/Documentation/admin-guide/cgroup-v2.rst -@@ -1121,6 +1121,16 @@ All time durations are in microseconds. - values similar to the sched_setattr(2). This maximum utilization - value is used to clamp the task specific maximum utilization clamp. - -+ cpu.latency.nice -+ A read-write single value file which exists on non-root -+ cgroups. The default is "0". -+ -+ The nice value is in the range [-20, 19]. -+ -+ This interface file allows reading and setting latency using the -+ same values used by sched_setattr(2). The latency_nice of a group is -+ used to limit the impact of the latency_nice of a task outside the -+ group. - - - Memory -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 263caac8f76b7..8a541fe2d4626 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -11247,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + * Propagate the changes of the sched_entity across the tg tree to make it +@@ -12313,16 +12254,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) + static void detach_task_cfs_rq(struct task_struct *p) { - return sched_group_set_idle(css_tg(css), idle); - } -+ -+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, -+ struct cftype *cft) -+{ -+ return PRIO_TO_NICE(css_tg(css)->latency_prio); -+} -+ -+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, -+ struct cftype *cft, s64 nice) -+{ -+ int prio; -+ -+ if (nice < MIN_NICE || nice > MAX_NICE) -+ return -ERANGE; -+ -+ prio = NICE_TO_PRIO(nice); -+ -+ return sched_group_set_latency(css_tg(css), prio); -+} - #endif + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- +- if (!vruntime_normalized(p)) { +- /* +- * Fix up our vruntime so that the current sleep doesn't +- * cause 'unlimited' sleep bonus. +- */ +- place_entity(cfs_rq, se, 0); +- se->vruntime -= cfs_rq->min_vruntime; +- } - static struct cftype cpu_legacy_files[] = { -@@ -11261,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = { - .read_s64 = cpu_idle_read_s64, - .write_s64 = cpu_idle_write_s64, - }, -+ { -+ .name = "latency.nice", -+ .read_s64 = cpu_latency_nice_read_s64, -+ .write_s64 = cpu_latency_nice_write_s64, -+ }, - #endif - #ifdef CONFIG_CFS_BANDWIDTH - { -@@ -11500,6 +11524,12 @@ static struct cftype cpu_files[] = { - .read_s64 = cpu_idle_read_s64, - .write_s64 = cpu_idle_write_s64, - }, -+ { -+ .name = "latency.nice", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .read_s64 = cpu_latency_nice_read_s64, -+ .write_s64 = cpu_latency_nice_write_s64, -+ }, - #endif - #ifdef CONFIG_CFS_BANDWIDTH - { -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index c2019e7d46cf5..8a4799c600309 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -12545,6 +12545,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + detach_entity_cfs_rq(se); + } +@@ -12330,12 +12261,8 @@ static void detach_task_cfs_rq(struct task_struct *p) + static void attach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); +- +- if (!vruntime_normalized(p)) +- se->vruntime += cfs_rq->min_vruntime; + } + + static void switched_from_fair(struct rq *rq, struct task_struct *p) +@@ -12446,6 +12373,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) goto err; tg->shares = NICE_0_LOAD; @@ -3367,7 +2146,7 @@ index c2019e7d46cf5..8a4799c600309 100644 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); -@@ -12643,6 +12644,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, +@@ -12544,6 +12472,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, } se->my_q = cfs_rq; @@ -3377,7 +2156,7 @@ index c2019e7d46cf5..8a4799c600309 100644 /* guarantee group entities always have weight */ update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; -@@ -12773,6 +12777,29 @@ next_cpu: +@@ -12674,6 +12605,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) return 0; } @@ -3407,8 +2186,73 @@ index c2019e7d46cf5..8a4799c600309 100644 #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } +@@ -12700,7 +12654,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task + * idle runqueue: + */ + if (rq->cfs.load.weight) +- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); ++ rr_interval = NS_TO_JIFFIES(se->slice); + + return rr_interval; + } +@@ -12717,6 +12671,7 @@ DEFINE_SCHED_CLASS(fair) = { + + .check_preempt_curr = check_preempt_wakeup, + ++ .eligible_task = eligible_task_fair, + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index ee7f23c76..5ae5a6f92 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -1,16 +1,13 @@ + /* SPDX-License-Identifier: GPL-2.0 */ +-/* +- * Only give sleepers 50% of their service deficit. This allows +- * them to run sooner, but does not allow tons of sleepers to +- * rip the spread apart. +- */ +-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + + /* +- * Place new tasks ahead so that they do not starve already running +- * tasks ++ * Using the avg_vruntime, do the right thing and preserve lag across ++ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ +-SCHED_FEAT(START_DEBIT, true) ++SCHED_FEAT(PLACE_LAG, true) ++SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) ++SCHED_FEAT(RUN_TO_PARITY, true) ++SCHED_FEAT(DELAY_DEQUEUE, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +@@ -19,13 +16,6 @@ SCHED_FEAT(START_DEBIT, true) + */ + SCHED_FEAT(NEXT_BUDDY, false) + +-/* +- * Prefer to schedule the task that ran last (when we did +- * wake-preempt) as that likely will touch the same data, increases +- * cache locality. +- */ +-SCHED_FEAT(LAST_BUDDY, true) +- + /* + * Consider buddies to be cache hot, decreases the likeliness of a + * cache buddy being migrated away, increases cache locality. +@@ -98,6 +88,3 @@ SCHED_FEAT(UTIL_EST, true) + SCHED_FEAT(UTIL_EST_FASTUP, true) + + SCHED_FEAT(LATENCY_WARN, false) +- +-SCHED_FEAT(ALT_PERIOD, true) +-SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 8f8d903a01892..4236c4c893aa7 100644 +index e93e006a9..050b447d2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -372,6 +372,8 @@ struct task_group { @@ -3429,212 +2273,102 @@ index 8f8d903a01892..4236c4c893aa7 100644 #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); --- -cgit +@@ -548,6 +552,9 @@ struct cfs_rq { + unsigned int idle_nr_running; /* SCHED_IDLE */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ -From b412068f928064d23f67709f46d36d7659079e54 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Mon, 22 May 2023 13:46:30 +0200 -Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice - -As an alternative to the latency-nice interface; allow applications to -directly set the request/slice using sched_attr::sched_runtime. - -The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms] -which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100. - -Applications should strive to use their periodic runtime at a high -confidence interval (95%+) as the target slice. Using a smaller slice -will introduce undue preemptions, while using a larger value will -increase latency. - -Signed-off-by: Peter Zijlstra (Intel) ---- - kernel/sched/core.c | 24 ++++++++++++++++++------ - 1 file changed, 18 insertions(+), 6 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 8a541fe2d4626..5b71c398f6cf6 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -7548,10 +7548,18 @@ static void __setscheduler_params(struct task_struct *p, - - p->policy = policy; - -- if (dl_policy(policy)) -+ if (dl_policy(policy)) { - __setparam_dl(p, attr); -- else if (fair_policy(policy)) -+ } else if (fair_policy(policy)) { - p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ if (attr->sched_runtime) { -+ p->se.slice = clamp_t(u64, attr->sched_runtime, -+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ -+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */ -+ } else { -+ p->se.slice = sysctl_sched_base_slice; -+ } -+ } - - /* - * __sched_setscheduler() ensures attr->sched_priority == 0 when -@@ -7750,7 +7758,9 @@ recheck: - * but store a possible modification of reset_on_fork. ++ s64 avg_vruntime; ++ u64 avg_load; ++ + u64 exec_clock; + u64 min_vruntime; + #ifdef CONFIG_SCHED_CORE +@@ -567,8 +574,6 @@ struct cfs_rq { */ - if (unlikely(policy == p->policy)) { -- if (fair_policy(policy) && attr->sched_nice != task_nice(p)) -+ if (fair_policy(policy) && -+ (attr->sched_nice != task_nice(p) || -+ (attr->sched_runtime && attr->sched_runtime != p->se.slice))) - goto change; - if (rt_policy(policy) && attr->sched_priority != p->rt_priority) - goto change; -@@ -8079,12 +8089,14 @@ err_size: + struct sched_entity *curr; + struct sched_entity *next; +- struct sched_entity *last; +- struct sched_entity *skip; - static void get_params(struct task_struct *p, struct sched_attr *attr) + #ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +@@ -2195,6 +2200,7 @@ extern const u32 sched_prio_to_wmult[40]; + #else + #define ENQUEUE_MIGRATED 0x00 + #endif ++#define ENQUEUE_INITIAL 0x80 + + #define RETRY_TASK ((void *)-1UL) + +@@ -2217,6 +2223,7 @@ struct sched_class { + + void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); + ++ bool (*eligible_task)(struct rq *rq, struct task_struct *p); + struct task_struct *(*pick_next_task)(struct rq *rq); + + void (*put_prev_task)(struct rq *rq, struct task_struct *p); +@@ -2270,7 +2277,7 @@ struct sched_class { + + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { -- if (task_has_dl_policy(p)) -+ if (task_has_dl_policy(p)) { - __getparam_dl(p, attr); -- else if (task_has_rt_policy(p)) -+ } else if (task_has_rt_policy(p)) { - attr->sched_priority = p->rt_priority; -- else -+ } else { - attr->sched_nice = task_nice(p); -+ attr->sched_runtime = p->se.slice; -+ } +- WARN_ON_ONCE(rq->curr != prev); ++// WARN_ON_ONCE(rq->curr != prev); + prev->sched_class->put_prev_task(rq, prev); } - /** --- -cgit +@@ -2499,11 +2506,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + extern const_debug unsigned int sysctl_sched_nr_migrate; + extern const_debug unsigned int sysctl_sched_migration_cost; -From 2f88c8e802c8b128a155976631f4eb2ce4f3c805 Mon Sep 17 00:00:00 2001 -From: Shrikanth Hegde -Date: Thu, 24 Aug 2023 13:33:42 +0530 -Subject: sched/eevdf/doc: Modify the documented knob to base_slice_ns as well - -After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns' -sysctl to 'base_slice_ns': - - e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice") - -... but we forgot to rename it in the documentation. Do that now. - -Fixes: e4ec3318a17f ("sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice") -Signed-off-by: Shrikanth Hegde -Signed-off-by: Ingo Molnar -Cc: Peter Zijlstra -Link: https://lore.kernel.org/r/20230824080342.543396-1-sshegde@linux.vnet.ibm.com ---- - Documentation/scheduler/sched-design-CFS.rst | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst -index 03db555045151..f68919800f050 100644 ---- a/Documentation/scheduler/sched-design-CFS.rst -+++ b/Documentation/scheduler/sched-design-CFS.rst -@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the - way the previous scheduler had, and has no heuristics whatsoever. There is - only one central tunable (you have to switch on CONFIG_SCHED_DEBUG): - -- /sys/kernel/debug/sched/min_granularity_ns -+ /sys/kernel/debug/sched/base_slice_ns - - which can be used to tune the scheduler from "desktop" (i.e., low latencies) to - "server" (i.e., good batching) workloads. It defaults to a setting suitable --- -cgit - -From 63304558ba5dcaaff9e052ee43cfdcc7f9c29e85 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Wed, 16 Aug 2023 15:40:59 +0200 -Subject: sched/eevdf: Curb wakeup-preemption - -Mike and others noticed that EEVDF does like to over-schedule quite a -bit -- which does hurt performance of a number of benchmarks / -workloads. - -In particular, what seems to cause over-scheduling is that when lag is -of the same order (or larger) than the request / slice then placement -will not only cause the task to be placed left of current, but also -with a smaller deadline than current, which causes immediate -preemption. - -[ notably, lag bounds are relative to HZ ] - -Mike suggested we stick to picking 'current' for as long as it's -eligible to run, giving it uninterrupted runtime until it reaches -parity with the pack. - -Augment Mike's suggestion by only allowing it to exhaust it's initial -request. - -One random data point: - -echo NO_RUN_TO_PARITY > /debug/sched/features -perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000 - - 3,723,554 context-switches ( +- 0.56% ) - 9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% ) - -echo RUN_TO_PARITY > /debug/sched/features -perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000 - - 2,556,535 context-switches ( +- 0.51% ) - 9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% ) - -Suggested-by: Mike Galbraith -Signed-off-by: Peter Zijlstra (Intel) -Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net ---- - kernel/sched/fair.c | 12 ++++++++++++ - kernel/sched/features.h | 1 + - 2 files changed, 13 insertions(+) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index f496cef90ce77..0b7445cd5af98 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -873,6 +873,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) - if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) - curr = NULL; - -+ /* -+ * Once selected, run a task until it either becomes non-eligible or -+ * until it gets a new slice. See the HACK in set_next_entity(). -+ */ -+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) -+ return curr; ++extern unsigned int sysctl_sched_base_slice; + - while (node) { - struct sched_entity *se = __node_2_se(node); + #ifdef CONFIG_SCHED_DEBUG +-extern unsigned int sysctl_sched_latency; +-extern unsigned int sysctl_sched_min_granularity; +-extern unsigned int sysctl_sched_idle_min_granularity; +-extern unsigned int sysctl_sched_wakeup_granularity; + extern int sysctl_resched_latency_warn_ms; + extern int sysctl_resched_latency_warn_once; -@@ -5167,6 +5174,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - update_stats_wait_end_fair(cfs_rq, se); - __dequeue_entity(cfs_rq, se); - update_load_avg(cfs_rq, se, UPDATE_TG); -+ /* -+ * HACK, stash a copy of deadline at the point of pick in vlag, -+ * which isn't used until dequeue. -+ */ -+ se->vlag = se->deadline; - } +@@ -2516,6 +2521,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + #endif - update_stats_curr_start(cfs_rq, se); -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 61bcbf5e46a45..f770168230ae4 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -6,6 +6,7 @@ - */ - SCHED_FEAT(PLACE_LAG, true) - SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) -+SCHED_FEAT(RUN_TO_PARITY, true) ++extern void set_latency_fair(struct sched_entity *se, int prio); ++ + #ifdef CONFIG_SCHED_HRTICK /* - * Prefer to schedule the task we woke last (assuming it failed --- -cgit +@@ -3480,4 +3487,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif + ++extern u64 avg_vruntime(struct cfs_rq *cfs_rq); ++extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); ++ + #endif /* _KERNEL_SCHED_SCHED_H */ +diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h +index 3bac0a8ce..b2e932c25 100644 +--- a/tools/include/uapi/linux/sched.h ++++ b/tools/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +-- +2.42.0