From ccbf7e5a82f8065cac9e13179af37bb7d6f3035f Mon Sep 17 00:00:00 2001 From: ptr1337 Date: Thu, 23 Feb 2023 13:01:29 +0100 Subject: [PATCH] 6.2: Update prjc patchset (#710) Signed-off-by: Peter Jung --- linux-tkg-patches/6.2/0009-prjc_v6.2-r0.patch | 447 +++++++++++++----- 1 file changed, 320 insertions(+), 127 deletions(-) diff --git a/linux-tkg-patches/6.2/0009-prjc_v6.2-r0.patch b/linux-tkg-patches/6.2/0009-prjc_v6.2-r0.patch index cd093f8..c3304d2 100644 --- a/linux-tkg-patches/6.2/0009-prjc_v6.2-r0.patch +++ b/linux-tkg-patches/6.2/0009-prjc_v6.2-r0.patch @@ -1,8 +1,62 @@ +From e44ef62b127f6a161a131c84db92a7527d8fc72d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Wed, 22 Feb 2023 19:24:36 +0100 +Subject: [PATCH] prjc + +Signed-off-by: Peter Jung +--- + .../admin-guide/kernel-parameters.txt | 6 + + Documentation/admin-guide/sysctl/kernel.rst | 10 + + Documentation/scheduler/sched-BMQ.txt | 110 + + fs/proc/base.c | 2 +- + include/asm-generic/resource.h | 2 +- + include/linux/sched.h | 33 +- + include/linux/sched/deadline.h | 20 + + include/linux/sched/prio.h | 26 + + include/linux/sched/rt.h | 2 + + include/linux/sched/topology.h | 3 +- + init/Kconfig | 34 + + init/init_task.c | 18 + + kernel/Kconfig.preempt | 2 +- + kernel/cgroup/cpuset.c | 4 +- + kernel/delayacct.c | 2 +- + kernel/exit.c | 4 +- + kernel/locking/rtmutex.c | 16 +- + kernel/sched/Makefile | 5 + + kernel/sched/alt_core.c | 8111 +++++++++++++++++ + kernel/sched/alt_debug.c | 31 + + kernel/sched/alt_sched.h | 671 ++ + kernel/sched/bmq.h | 110 + + kernel/sched/build_policy.c | 8 +- + kernel/sched/build_utility.c | 2 + + kernel/sched/cpufreq_schedutil.c | 10 + + kernel/sched/cputime.c | 10 +- + kernel/sched/debug.c | 10 + + kernel/sched/idle.c | 2 + + kernel/sched/pds.h | 127 + + kernel/sched/pelt.c | 4 +- + kernel/sched/pelt.h | 8 +- + kernel/sched/sched.h | 9 + + kernel/sched/stats.c | 4 + + kernel/sched/stats.h | 2 + + kernel/sched/topology.c | 17 + + kernel/sysctl.c | 15 + + kernel/time/hrtimer.c | 2 + + kernel/time/posix-cpu-timers.c | 10 +- + kernel/trace/trace_selftest.c | 5 + + 39 files changed, 9445 insertions(+), 22 deletions(-) + create mode 100644 Documentation/scheduler/sched-BMQ.txt + create mode 100644 kernel/sched/alt_core.c + create mode 100644 kernel/sched/alt_debug.c + create mode 100644 kernel/sched/alt_sched.h + create mode 100644 kernel/sched/bmq.h + create mode 100644 kernel/sched/pds.h + diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 42af9ca0127e..31747ec54f9d 100644 +index 6cfa6e3996cf..1b6a407213da 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -5406,6 +5406,12 @@ +@@ -5437,6 +5437,12 @@ sa1100ir [NET] See drivers/net/irda/sa1100_ir.c. @@ -16,10 +70,10 @@ index 42af9ca0127e..31747ec54f9d 100644 schedstats= [KNL,X86] Enable or disable scheduled statistics. diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 98d1b198b2b4..d7c78a107f93 100644 +index 46e3d62c0eea..fb4568c919d0 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -1552,3 +1552,13 @@ is 10 seconds. +@@ -1597,3 +1597,13 @@ is 10 seconds. The softlockup threshold is (``2 * watchdog_thresh``). Setting this tunable to zero will disable lockup detection altogether. @@ -176,7 +230,7 @@ index 8874f681b056..59eb72bf7d5f 100644 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ } diff --git a/include/linux/sched.h b/include/linux/sched.h -index ffb6eb55cd13..2e730a59caa2 100644 +index 853d08f7562b..ad7e050d7455 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -762,8 +762,14 @@ struct task_struct { @@ -232,7 +286,7 @@ index ffb6eb55cd13..2e730a59caa2 100644 #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; -@@ -1545,6 +1567,15 @@ struct task_struct { +@@ -1539,6 +1561,15 @@ struct task_struct { */ }; @@ -352,10 +406,10 @@ index 816df6cc444e..c8da08e18c91 100644 #else static inline void rebuild_sched_domains_energy(void) diff --git a/init/Kconfig b/init/Kconfig -index 94125d3b6893..c87ba766d354 100644 +index 44e90b28a30f..af24591984ab 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -819,6 +819,7 @@ menu "Scheduler features" +@@ -821,6 +821,7 @@ menu "Scheduler features" config UCLAMP_TASK bool "Enable utilization clamping for RT/FAIR tasks" depends on CPU_FREQ_GOV_SCHEDUTIL @@ -363,7 +417,7 @@ index 94125d3b6893..c87ba766d354 100644 help This feature enables the scheduler to track the clamped utilization of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -865,6 +866,35 @@ config UCLAMP_BUCKETS_COUNT +@@ -867,6 +868,35 @@ config UCLAMP_BUCKETS_COUNT If in doubt, use the default value. @@ -399,7 +453,7 @@ index 94125d3b6893..c87ba766d354 100644 endmenu # -@@ -918,6 +948,7 @@ config NUMA_BALANCING +@@ -924,6 +954,7 @@ config NUMA_BALANCING depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY depends on SMP && NUMA && MIGRATION && !PREEMPT_RT @@ -407,7 +461,7 @@ index 94125d3b6893..c87ba766d354 100644 help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when -@@ -1015,6 +1046,7 @@ config FAIR_GROUP_SCHED +@@ -1021,6 +1052,7 @@ config FAIR_GROUP_SCHED depends on CGROUP_SCHED default CGROUP_SCHED @@ -415,7 +469,7 @@ index 94125d3b6893..c87ba766d354 100644 config CFS_BANDWIDTH bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" depends on FAIR_GROUP_SCHED -@@ -1037,6 +1069,7 @@ config RT_GROUP_SCHED +@@ -1043,6 +1075,7 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. @@ -423,7 +477,7 @@ index 94125d3b6893..c87ba766d354 100644 endif #CGROUP_SCHED config UCLAMP_TASK_GROUP -@@ -1281,6 +1314,7 @@ config CHECKPOINT_RESTORE +@@ -1287,6 +1320,7 @@ config CHECKPOINT_RESTORE config SCHED_AUTOGROUP bool "Automatic process group scheduling" @@ -491,10 +545,10 @@ index c2f1fd95a821..41654679b1b2 100644 This option permits Core Scheduling, a means of coordinated task selection across SMT siblings. When enabled -- see diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index b474289c15b8..a23224b45b03 100644 +index ca826bd1eba3..60e194f1d6d8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c -@@ -787,7 +787,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) +@@ -791,7 +791,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) return ret; } @@ -503,7 +557,7 @@ index b474289c15b8..a23224b45b03 100644 /* * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping effective cpus_allowed masks? -@@ -1183,7 +1183,7 @@ static void rebuild_sched_domains_locked(void) +@@ -1187,7 +1187,7 @@ static void rebuild_sched_domains_locked(void) /* Have scheduler rebuild the domains */ partition_and_rebuild_sched_domains(ndoms, doms, attr); } @@ -526,10 +580,10 @@ index e39cb696cfbd..463423572e09 100644 d->cpu_count += t1; diff --git a/kernel/exit.c b/kernel/exit.c -index 35e0a31a0315..64e368441cf4 100644 +index 15dc2ec80c46..1e583e0f89a7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c -@@ -125,7 +125,7 @@ static void __exit_signal(struct task_struct *tsk) +@@ -172,7 +172,7 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } @@ -538,7 +592,7 @@ index 35e0a31a0315..64e368441cf4 100644 sizeof(unsigned long long)); /* -@@ -146,7 +146,7 @@ static void __exit_signal(struct task_struct *tsk) +@@ -193,7 +193,7 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); @@ -548,10 +602,10 @@ index 35e0a31a0315..64e368441cf4 100644 __unhash_process(tsk, group_dead); write_sequnlock(&sig->stats_lock); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index 7779ee8abc2a..5b9893cdfb1b 100644 +index 728f434de2bb..0e1082a4e878 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c -@@ -300,21 +300,25 @@ static __always_inline void +@@ -337,21 +337,25 @@ static __always_inline void waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) { waiter->prio = __waiter_prio(task); @@ -579,7 +633,7 @@ index 7779ee8abc2a..5b9893cdfb1b 100644 /* * If both waiters have dl_prio(), we check the deadlines of the * associated tasks. -@@ -323,16 +327,22 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -360,16 +364,22 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, */ if (dl_prio(left->prio)) return dl_time_before(left->deadline, right->deadline); @@ -602,7 +656,7 @@ index 7779ee8abc2a..5b9893cdfb1b 100644 /* * If both waiters have dl_prio(), we check the deadlines of the * associated tasks. -@@ -341,8 +351,10 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, +@@ -378,8 +388,10 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, */ if (dl_prio(left->prio)) return left->deadline == right->deadline; @@ -632,10 +686,10 @@ index 976092b7bd45..31d587c16ec1 100644 obj-y += build_utility.o diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c new file mode 100644 -index 000000000000..acb8657e811d +index 000000000000..f5e9c01f9382 --- /dev/null +++ b/kernel/sched/alt_core.c -@@ -0,0 +1,7978 @@ +@@ -0,0 +1,8111 @@ +/* + * kernel/sched/alt_core.c + * @@ -705,7 +759,7 @@ index 000000000000..acb8657e811d +#define sched_feat(x) (0) +#endif /* CONFIG_SCHED_DEBUG */ + -+#define ALT_SCHED_VERSION "v6.1-r3" ++#define ALT_SCHED_VERSION "v6.2-r0" + +/* rt_prio(prio) defined in include/linux/sched/rt.h */ +#define rt_task(p) rt_prio((p)->prio) @@ -726,6 +780,12 @@ index 000000000000..acb8657e811d +#include "pds.h" +#endif + ++struct affinity_context { ++ const struct cpumask *new_mask; ++ struct cpumask *user_mask; ++ unsigned int flags; ++}; ++ +static int __init sched_timeslice(char *str) +{ + int timeslice_ms; @@ -788,6 +848,14 @@ index 000000000000..acb8657e811d +static cpumask_t sched_preempt_mask[SCHED_QUEUE_BITS] ____cacheline_aligned_in_smp; +static cpumask_t *const sched_idle_mask = &sched_preempt_mask[0]; + ++/* task function */ ++static inline const struct cpumask *task_user_cpus(struct task_struct *p) ++{ ++ if (!p->user_cpus_ptr) ++ return cpu_possible_mask; /* &init_task.cpus_mask */ ++ return p->user_cpus_ptr; ++} ++ +/* sched_queue related functions */ +static inline void sched_queue_init(struct sched_queue *q) +{ @@ -1400,7 +1468,7 @@ index 000000000000..acb8657e811d + +#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ + sched_info_enqueue(rq, p); \ -+ psi_enqueue(p, flags); \ ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); \ + \ + p->sq_idx = task_sched_prio_idx(p, rq); \ + list_add_tail(&p->sq_node, &rq->queue.heads[p->sq_idx]); \ @@ -2268,35 +2336,101 @@ index 000000000000..acb8657e811d +} + +static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx) +{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); ++ cpumask_copy(&p->cpus_mask, ctx->new_mask); ++ p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); ++ ++ /* ++ * Swap in a new user_cpus_ptr if SCA_USER flag set ++ */ ++ if (ctx->flags & SCA_USER) ++ swap(p->user_cpus_ptr, ctx->user_mask); +} + +static void -+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) +{ + lockdep_assert_held(&p->pi_lock); -+ set_cpus_allowed_common(p, new_mask); ++ set_cpus_allowed_common(p, ctx); +} + ++/* ++ * Used for kthread_bind() and select_fallback_rq(), in both cases the user ++ * affinity (if any) should be destroyed too. ++ */ +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ -+ __do_set_cpus_allowed(p, new_mask); ++ struct affinity_context ac = { ++ .new_mask = new_mask, ++ .user_mask = NULL, ++ .flags = SCA_USER, /* clear the user requested mask */ ++ }; ++ union cpumask_rcuhead { ++ cpumask_t cpumask; ++ struct rcu_head rcu; ++ }; ++ ++ __do_set_cpus_allowed(p, &ac); ++ ++ /* ++ * Because this is called with p->pi_lock held, it is not possible ++ * to use kfree() here (when PREEMPT_RT=y), therefore punt to using ++ * kfree_rcu(). ++ */ ++ kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu); ++} ++ ++static cpumask_t *alloc_user_cpus_ptr(int node) ++{ ++ /* ++ * See do_set_cpus_allowed() above for the rcu_head usage. ++ */ ++ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); ++ ++ return kmalloc_node(size, GFP_KERNEL, node); +} + +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, + int node) +{ -+ if (!src->user_cpus_ptr) ++ cpumask_t *user_mask; ++ unsigned long flags; ++ ++ /* ++ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's ++ * may differ by now due to racing. ++ */ ++ dst->user_cpus_ptr = NULL; ++ ++ /* ++ * This check is racy and losing the race is a valid situation. ++ * It is not worth the extra overhead of taking the pi_lock on ++ * every fork/clone. ++ */ ++ if (data_race(!src->user_cpus_ptr)) + return 0; + -+ dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); -+ if (!dst->user_cpus_ptr) ++ user_mask = alloc_user_cpus_ptr(node); ++ if (!user_mask) + return -ENOMEM; + -+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); ++ /* ++ * Use pi_lock to protect content of user_cpus_ptr ++ * ++ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent ++ * do_set_cpus_allowed(). ++ */ ++ raw_spin_lock_irqsave(&src->pi_lock, flags); ++ if (src->user_cpus_ptr) { ++ swap(dst->user_cpus_ptr, user_mask); ++ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); ++ } ++ raw_spin_unlock_irqrestore(&src->pi_lock, flags); ++ ++ if (unlikely(user_mask)) ++ kfree(user_mask); ++ + return 0; +} + @@ -2641,6 +2775,8 @@ index 000000000000..acb8657e811d + +static int affine_move_task(struct rq *rq, struct task_struct *p, int dest_cpu, + raw_spinlock_t *lock, unsigned long irq_flags) ++ __releases(rq->lock) ++ __releases(p->pi_lock) +{ + /* Can the task run on the task's current CPU? If so, we're done */ + if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { @@ -2678,8 +2814,7 @@ index 000000000000..acb8657e811d +} + +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, -+ const struct cpumask *new_mask, -+ u32 flags, ++ struct affinity_context *ctx, + struct rq *rq, + raw_spinlock_t *lock, + unsigned long irq_flags) @@ -2687,7 +2822,6 @@ index 000000000000..acb8657e811d + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); + const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool kthread = p->flags & PF_KTHREAD; -+ struct cpumask *user_mask = NULL; + int dest_cpu; + int ret = 0; + @@ -2705,7 +2839,7 @@ index 000000000000..acb8657e811d + cpu_valid_mask = cpu_online_mask; + } + -+ if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { ++ if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) { + ret = -EINVAL; + goto out; + } @@ -2714,30 +2848,23 @@ index 000000000000..acb8657e811d + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ -+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ++ if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + -+ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) + goto out; + -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ dest_cpu = cpumask_any_and(cpu_valid_mask, ctx->new_mask); + if (dest_cpu >= nr_cpu_ids) { + ret = -EINVAL; + goto out; + } + -+ __do_set_cpus_allowed(p, new_mask); ++ __do_set_cpus_allowed(p, ctx); + -+ if (flags & SCA_USER) -+ user_mask = clear_user_cpus_ptr(p); -+ -+ ret = affine_move_task(rq, p, dest_cpu, lock, irq_flags); -+ -+ kfree(user_mask); -+ -+ return ret; ++ return affine_move_task(rq, p, dest_cpu, lock, irq_flags); + +out: + __task_access_unlock(p, lock); @@ -2748,7 +2875,6 @@ index 000000000000..acb8657e811d + +/* + * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the @@ -2756,7 +2882,7 @@ index 000000000000..acb8657e811d + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, u32 flags) ++ struct affinity_context *ctx) +{ + unsigned long irq_flags; + struct rq *rq; @@ -2764,20 +2890,36 @@ index 000000000000..acb8657e811d + + raw_spin_lock_irqsave(&p->pi_lock, irq_flags); + rq = __task_access_lock(p, &lock); ++ /* ++ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_* ++ * flags are set. ++ */ ++ if (p->user_cpus_ptr && ++ !(ctx->flags & SCA_USER) && ++ cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr)) ++ ctx->new_mask = rq->scratch_mask; + -+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, lock, irq_flags); ++ ++ return __set_cpus_allowed_ptr_locked(p, ctx, rq, lock, irq_flags); +} + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ -+ return __set_cpus_allowed_ptr(p, new_mask, 0); ++ struct affinity_context ac = { ++ .new_mask = new_mask, ++ .flags = 0, ++ }; ++ ++ return __set_cpus_allowed_ptr(p, &ac); +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +/* + * Change a given task's CPU affinity to the intersection of its current -+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask -+ * and pointing @p->user_cpus_ptr to a copy of the old mask. ++ * affinity mask and @subset_mask, writing the resulting mask to @new_mask. ++ * If user_cpus_ptr is defined, use it as the basis for restricting CPU ++ * affinity or use cpu_online_mask instead. ++ * + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ @@ -2785,48 +2927,34 @@ index 000000000000..acb8657e811d + struct cpumask *new_mask, + const struct cpumask *subset_mask) +{ -+ struct cpumask *user_mask = NULL; ++ struct affinity_context ac = { ++ .new_mask = new_mask, ++ .flags = 0, ++ }; + unsigned long irq_flags; + raw_spinlock_t *lock; + struct rq *rq; + int err; + -+ if (!p->user_cpus_ptr) { -+ user_mask = kmalloc(cpumask_size(), GFP_KERNEL); -+ if (!user_mask) -+ return -ENOMEM; -+ } -+ + raw_spin_lock_irqsave(&p->pi_lock, irq_flags); + rq = __task_access_lock(p, &lock); + -+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { ++ if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) { + err = -EINVAL; + goto err_unlock; + } + -+ /* -+ * We're about to butcher the task affinity, so keep track of what -+ * the user asked for in case we're able to restore it later on. -+ */ -+ if (user_mask) { -+ cpumask_copy(user_mask, p->cpus_ptr); -+ p->user_cpus_ptr = user_mask; -+ } -+ -+ /*return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);*/ -+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, lock, irq_flags); ++ return __set_cpus_allowed_ptr_locked(p, &ac, rq, lock, irq_flags); + +err_unlock: + __task_access_unlock(p, lock); + raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); -+ kfree(user_mask); + return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of -+ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the ++ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ @@ -2870,34 +2998,29 @@ index 000000000000..acb8657e811d +} + +static int -+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); ++__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); + +/* + * Restore the affinity of a task @p which was previously restricted by a -+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) -+ * @p->user_cpus_ptr. ++ * call to force_compatible_cpus_allowed_ptr(). + * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). + */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ -+ struct cpumask *user_mask = p->user_cpus_ptr; -+ unsigned long flags; ++ struct affinity_context ac = { ++ .new_mask = task_user_cpus(p), ++ .flags = 0, ++ }; ++ int ret; + + /* -+ * Try to restore the old affinity mask. If this fails, then -+ * we free the mask explicitly to avoid it being inherited across -+ * a subsequent fork(). ++ * Try to restore the old affinity mask with __sched_setaffinity(). ++ * Cpuset masking will be done there too. + */ -+ if (!user_mask || !__sched_setaffinity(p, user_mask)) -+ return; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ user_mask = clear_user_cpus_ptr(p); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ kfree(user_mask); ++ ret = __sched_setaffinity(p, &ac); ++ WARN_ON_ONCE(ret); +} + +#else /* CONFIG_SMP */ @@ -2909,9 +3032,9 @@ index 000000000000..acb8657e811d + +static inline int +__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, u32 flags) ++ struct affinity_context *ctx) +{ -+ return set_cpus_allowed_ptr(p, new_mask); ++ return set_cpus_allowed_ptr(p, ctx->new_mask); +} + +static inline bool rq_has_pinned_tasks(struct rq *rq) @@ -2919,6 +3042,11 @@ index 000000000000..acb8657e811d + return false; +} + ++static inline cpumask_t *alloc_user_cpus_ptr(int node) ++{ ++ return NULL; ++} ++ +#endif /* !CONFIG_SMP */ + +static void @@ -3030,13 +3158,6 @@ index 000000000000..acb8657e811d + if (!llist) + return; + -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. -+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + @@ -3050,6 +3171,17 @@ index 000000000000..acb8657e811d + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0); + } + ++ /* ++ * Must be after enqueueing at least once task such that ++ * idle_cpu() does not observe a false-negative -- if it does, ++ * it is possible for select_idle_siblings() to stack a number ++ * of tasks on this CPU during that window. ++ * ++ * It is ok to clear ttwu_pending when another task pending. ++ * We will receive IPI after local irq enabled and then enqueue it. ++ * Since now nr_running > 0, idle_cpu() will always get correct result. ++ */ ++ WRITE_ONCE(rq->ttwu_pending, 0); + rq_unlock_irqrestore(rq, &rf); +} + @@ -4635,7 +4767,9 @@ index 000000000000..acb8657e811d + struct rq *rq = cpu_rq(cpu); + u64 resched_latency; + -+ arch_scale_freq_tick(); ++ if (housekeeping_cpu(cpu, HK_TYPE_TICK)) ++ arch_scale_freq_tick(); ++ + sched_clock_tick(); + + raw_spin_lock(&rq->lock); @@ -4734,7 +4868,7 @@ index 000000000000..acb8657e811d + int i; + + for_each_cpu_wrap(i, &chk, cpu) { -+ if (cpumask_subset(cpu_smt_mask(i), &chk) && ++ if (!cpumask_intersects(cpu_smt_mask(i), sched_idle_mask) &&\ + sg_balance_trigger(i)) + return; + } @@ -4857,6 +4991,7 @@ index 000000000000..acb8657e811d +static void sched_tick_stop(int cpu) +{ + struct tick_work *twork; ++ int os; + + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + return; @@ -4864,7 +4999,10 @@ index 000000000000..acb8657e811d + WARN_ON_ONCE(!tick_work_cpu); + + twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); ++ /* There cannot be competing actions, but don't rely on stop-machine. */ ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); ++ /* Don't cancel, as this would mess up the state machine. */ +} +#endif /* CONFIG_HOTPLUG_CPU */ + @@ -4988,8 +5126,7 @@ index 000000000000..acb8657e811d + pr_err("Preemption disabled at:"); + print_ip_sym(KERN_ERR, preempt_disable_ip); + } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); ++ check_panic_on_warn("scheduling while atomic"); + + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); @@ -5305,7 +5442,7 @@ index 000000000000..acb8657e811d + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && -+ !(prev->flags & TASK_FROZEN); ++ !(prev_state & TASK_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; @@ -6653,7 +6790,7 @@ index 000000000000..acb8657e811d +#endif + +static int -+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) ++__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) +{ + int retval; + cpumask_var_t cpus_allowed, new_mask; @@ -6667,9 +6804,12 @@ index 000000000000..acb8657e811d + } + + cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); ++ cpumask_and(new_mask, ctx->new_mask, cpus_allowed); ++ ++ ctx->new_mask = new_mask; ++ ctx->flags |= SCA_CHECK; ++ ++ retval = __set_cpus_allowed_ptr(p, ctx); + if (retval) + goto out_free_new_mask; + @@ -6681,7 +6821,24 @@ index 000000000000..acb8657e811d + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); -+ goto again; ++ ++ /* ++ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr() ++ * will restore the previous user_cpus_ptr value. ++ * ++ * In the unlikely event a previous user_cpus_ptr exists, ++ * we need to further restrict the mask to what is allowed ++ * by that old user_cpus_ptr. ++ */ ++ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) { ++ bool empty = !cpumask_and(new_mask, new_mask, ++ ctx->user_mask); ++ ++ if (WARN_ON_ONCE(empty)) ++ cpumask_copy(new_mask, cpus_allowed); ++ } ++ __set_cpus_allowed_ptr(p, ctx); ++ retval = -EINVAL; + } + +out_free_new_mask: @@ -6693,6 +6850,8 @@ index 000000000000..acb8657e811d + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ ++ struct affinity_context ac; ++ struct cpumask *user_mask; + struct task_struct *p; + int retval; + @@ -6727,7 +6886,27 @@ index 000000000000..acb8657e811d + if (retval) + goto out_put_task; + -+ retval = __sched_setaffinity(p, in_mask); ++ /* ++ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and ++ * alloc_user_cpus_ptr() returns NULL. ++ */ ++ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); ++ if (user_mask) { ++ cpumask_copy(user_mask, in_mask); ++ } else if (IS_ENABLED(CONFIG_SMP)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ ++ ac = (struct affinity_context){ ++ .new_mask = in_mask, ++ .user_mask = user_mask, ++ .flags = SCA_USER, ++ }; ++ ++ retval = __sched_setaffinity(p, &ac); ++ kfree(ac.user_mask); ++ +out_put_task: + put_task_struct(p); + return retval; @@ -7483,6 +7662,12 @@ index 000000000000..acb8657e811d + */ +void __init init_idle(struct task_struct *idle, int cpu) +{ ++#ifdef CONFIG_SMP ++ struct affinity_context ac = (struct affinity_context) { ++ .new_mask = cpumask_of(cpu), ++ .flags = 0, ++ }; ++#endif + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + @@ -7509,7 +7694,7 @@ index 000000000000..acb8657e811d + * + * And since this is boot we can forgo the serialisation. + */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++ set_cpus_allowed_common(idle, &ac); +#endif + + /* Silence PROVE_RCU */ @@ -8137,6 +8322,8 @@ index 000000000000..acb8657e811d + + hrtick_rq_init(rq); + atomic_set(&rq->nr_iowait, 0); ++ ++ zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } +#ifdef CONFIG_SMP + /* Set rq->online for cpu 0 */ @@ -8653,10 +8840,10 @@ index 000000000000..1212a031700e +{} diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h new file mode 100644 -index 000000000000..c32403ed82b6 +index 000000000000..0b563999d4c1 --- /dev/null +++ b/kernel/sched/alt_sched.h -@@ -0,0 +1,668 @@ +@@ -0,0 +1,671 @@ +#ifndef ALT_SCHED_H +#define ALT_SCHED_H + @@ -8903,6 +9090,9 @@ index 000000000000..c32403ed82b6 +#endif + atomic_t nohz_flags; +#endif /* CONFIG_NO_HZ_COMMON */ ++ ++ /* Scratch cpumask to be temporarily used under rq_lock */ ++ cpumask_var_t scratch_mask; +}; + +extern unsigned long rq_load_util(struct rq *rq, unsigned long max); @@ -9874,7 +10064,7 @@ index 3a0e0dc28721..e8a7d84aa5a5 100644 static inline int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index a4a20046e586..c363693cd869 100644 +index 771f8ddb7053..787a5069d69a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,10 @@ @@ -9888,7 +10078,7 @@ index a4a20046e586..c363693cd869 100644 #include #include #include -@@ -3183,4 +3187,9 @@ static inline void update_current_exec_runtime(struct task_struct *curr, +@@ -3261,4 +3265,9 @@ static inline void update_current_exec_runtime(struct task_struct *curr, cgroup_account_cputime(curr, delta_exec); } @@ -9930,7 +10120,7 @@ index 857f837f52cb..5486c63e4790 100644 } return 0; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h -index 84a188913cc9..53934e7ef5db 100644 +index 38f3698f5e5b..b9d597394316 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -89,6 +89,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt @@ -10005,7 +10195,7 @@ index 8739c2a5a54e..d8dd6c15eb47 100644 +#endif /* CONFIG_NUMA */ +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index c6d9dec11b74..2bc42ce8b48e 100644 +index 137d4abe3eda..6bada3a6d571 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -93,6 +93,10 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); @@ -10019,7 +10209,7 @@ index c6d9dec11b74..2bc42ce8b48e 100644 #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; #endif -@@ -1953,6 +1959,17 @@ static struct ctl_table kern_table[] = { +@@ -1934,6 +1938,17 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif @@ -10113,10 +10303,10 @@ index cb925e8ef9a8..67d823510f5c 100644 return false; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index a2d301f58ced..2ccdede8585c 100644 +index ff0536cea968..ce266990006d 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c -@@ -1143,10 +1143,15 @@ static int trace_wakeup_test_thread(void *data) +@@ -1150,10 +1150,15 @@ static int trace_wakeup_test_thread(void *data) { /* Make this a -deadline thread */ static const struct sched_attr attr = { @@ -10132,3 +10322,6 @@ index a2d301f58ced..2ccdede8585c 100644 }; struct wakeup_test_data *x = data; +-- +2.39.2 +