From 512f9bcc181e532b200ee37678a8ab0186e6d821 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 25 Oct 2019 14:00:52 +1100
Subject: [PATCH 01/34] MultiQueue Skiplist Scheduler v0.210
---
.../admin-guide/kernel-parameters.txt | 8 +
Documentation/admin-guide/sysctl/kernel.rst | 34 +
Documentation/scheduler/sched-BFS.txt | 351 +
Documentation/scheduler/sched-MuQSS.txt | 373 +
arch/alpha/Kconfig | 2 +
arch/arm/Kconfig | 2 +
arch/arm64/Kconfig | 2 +
arch/powerpc/Kconfig | 2 +
arch/powerpc/platforms/cell/spufs/sched.c | 5 -
arch/x86/Kconfig | 18 +
drivers/cpufreq/Kconfig | 1 +
fs/proc/base.c | 2 +-
include/linux/init_task.h | 4 +
include/linux/ioprio.h | 2 +
include/linux/sched.h | 61 +-
include/linux/sched/deadline.h | 9 +
include/linux/sched/nohz.h | 2 +-
include/linux/sched/prio.h | 6 +
include/linux/sched/rt.h | 2 +
include/linux/sched/task.h | 2 +-
include/linux/skip_list.h | 33 +
include/uapi/linux/sched.h | 9 +-
init/Kconfig | 24 +-
init/init_task.c | 10 +
init/main.c | 2 +
kernel/Kconfig.MuQSS | 106 +
kernel/Makefile | 3 +-
kernel/delayacct.c | 2 +-
kernel/exit.c | 4 +-
kernel/kthread.c | 30 +-
kernel/livepatch/transition.c | 6 +-
kernel/sched/Makefile | 10 +-
kernel/sched/MuQSS.c | 8320 +++++++++++++++++
kernel/sched/MuQSS.h | 1076 +++
kernel/sched/cpufreq_schedutil.c | 4 +
kernel/sched/cpupri.h | 2 +
kernel/sched/cputime.c | 22 +-
kernel/sched/idle.c | 2 +
kernel/sched/sched.h | 37 +
kernel/sched/topology.c | 8 +
kernel/skip_list.c | 148 +
kernel/sysctl.c | 70 +-
kernel/time/Kconfig | 2 +-
kernel/time/clockevents.c | 5 +
kernel/time/posix-cpu-timers.c | 4 +-
kernel/time/timer.c | 7 +-
kernel/trace/trace_selftest.c | 5 +
scripts/headers_install.sh | 1 +
48 files changed, 10792 insertions(+), 48 deletions(-)
create mode 100644 Documentation/scheduler/sched-BFS.txt
create mode 100644 Documentation/scheduler/sched-MuQSS.txt
create mode 100644 include/linux/skip_list.h
create mode 100644 kernel/Kconfig.MuQSS
create mode 100644 kernel/sched/MuQSS.c
create mode 100644 kernel/sched/MuQSS.h
create mode 100644 kernel/skip_list.c
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index cb89dbdedc4638..b97bafd6c124a5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4862,6 +4862,14 @@
Memory area to be used by remote processor image,
managed by CMA.
+ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type.
+ Format: <string>
+ smt -- Share SMT (hyperthread) sibling runqueues
+ mc -- Share MC (multicore) sibling runqueues
+ smp -- Share SMP runqueues
+			none -- Do not share any runqueues
+ Default value is mc
+
rw [KNL] Mount root device read-write on boot
S [KNL] Run init in single mode
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 68b21395a743f0..2881a2c4110784 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -436,6 +436,16 @@ this allows system administrators to override the
``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded.
+iso_cpu: (MuQSS CPU scheduler only)
+===================================
+
+This sets the percentage of cpu time that unprivileged SCHED_ISO tasks may
+run at effectively realtime priority, averaged over a rolling five seconds
+across the -whole- system, meaning all cpus.
+
+Set to 70 (percent) by default.
+
+
kexec_load_disabled
===================
@@ -1078,6 +1088,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after
rebooting. ???
+rr_interval: (MuQSS CPU scheduler only)
+=======================================
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. Conversely decreasing it will decrease average and maximum
+latencies but at the expense of throughput. This value is in
+milliseconds and the default value chosen depends on the number of
+cpus available at scheduler initialisation with a minimum of 6.
+
+Valid values are from 1-1000.
+
+
sched_energy_aware
==================
@@ -1527,3 +1551,13 @@ is 10 seconds.
The softlockup threshold is (``2 * watchdog_thresh``). Setting this
tunable to zero will disable lockup detection altogether.
+
+
+yield_type: (MuQSS CPU scheduler only)
+======================================
+
+This determines what type of yield calls to sched_yield will perform.
+
+ 0: No yield.
+ 1: Yield only to better priority/deadline tasks. (default)
+ 2: Expire timeslice and recalculate deadline.
diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt
new file mode 100644
index 00000000000000..c0282002a07913
--- /dev/null
+++ b/Documentation/scheduler/sched-BFS.txt
@@ -0,0 +1,351 @@
+BFS - The Brain Fuck Scheduler by Con Kolivas.
+
+Goals.
+
+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
+completely do away with the complex designs of the past for the cpu process
+scheduler and instead implement one that is very simple in basic design.
+The main focus of BFS is to achieve excellent desktop interactivity and
+responsiveness without heuristics and tuning knobs that are difficult to
+understand, impossible to model and predict the effect of, and when tuned to
+one workload cause massive detriment to another.
+
+
+Design summary.
+
+BFS is best described as a single runqueue, O(n) lookup, earliest effective
+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
+deadline first) and my previous Staircase Deadline scheduler. Each component
+shall be described in order to understand the significance of, and reasoning for
+it. When the first stable version was released, the codebase was roughly
+9000 lines smaller than the existing mainline linux kernel scheduler (in
+2.6.31). This does not even take into account the removal of documentation and
+the cgroups code that is not used.
+
+Design reasoning.
+
+The single runqueue refers to the queued but not running processes for the
+entire system, regardless of the number of CPUs. The reason for going back to
+a single runqueue design is that once multiple runqueues are introduced,
+per-CPU or otherwise, there will be complex interactions as each runqueue will
+be responsible for the scheduling latency and fairness of the tasks only on its
+own runqueue, and to achieve fairness and low latency across multiple CPUs, any
+advantage in throughput of having CPU local tasks causes other disadvantages.
+This is due to requiring a very complex balancing system to at best achieve some
+semblance of fairness across CPUs and can only maintain relatively low latency
+for tasks bound to the same CPUs, not across them. To increase said fairness
+and latency across CPUs, the advantage of local runqueue locking, which makes
+for better scalability, is lost due to having to grab multiple locks.
+
+A significant feature of BFS is that all accounting is done purely based on CPU
+used and nowhere is sleep time used in any way to determine entitlement or
+interactivity. Interactivity "estimators" that use some kind of sleep/run
+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
+tasks that aren't interactive as being so. The reason for this is that it is
+close to impossible to determine, when a task is sleeping, whether it is
+doing it voluntarily, as in a userspace application waiting for input in the
+form of a mouse click or otherwise, or involuntarily, because it is waiting for
+another thread, process, I/O, kernel activity or whatever. Thus, such an
+estimator will introduce corner cases, and more heuristics will be required to
+cope with those corner cases, introducing more corner cases and failed
+interactivity detection and so on. Interactivity in BFS is built into the design
+by virtue of the fact that tasks that are waking up have not used up their quota
+of CPU time, and have earlier effective deadlines, thereby making it very likely
+they will preempt any CPU bound task of equivalent nice level. See below for
+more information on the virtual deadline mechanism. Even if they do not preempt
+a running task, because the rr interval guarantees a bounded upper limit
+on how long a task will wait, it will be scheduled within a timeframe
+that will not cause visible interface jitter.
+
+
+Design details.
+
+Task insertion.
+
+BFS inserts tasks into each relevant queue as an O(1) insertion into a double
+linked list. On insertion, *every* running queue is checked to see if the newly
+queued task can run on any idle queue, or preempt the lowest running task on the
+system. This is how the cross-CPU scheduling of BFS achieves significantly lower
+latency per extra CPU the system has. In this case the lookup is, in the worst
+case scenario, O(n) where n is the number of CPUs on the system.
+
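+A hedged sketch of that insertion-time scan (the helper names here are
+illustrative, not the patch's own):
+
+	/* On insertion: O(n) scan for an idle CPU, or the CPU whose
+	 * running task has the latest ("worst") virtual deadline. */
+	struct rq *worst = NULL;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+
+		if (rq_is_idle(rq) && cpu_allowed(p, cpu)) {
+			resched_cpu(cpu);	/* an idle CPU wins outright */
+			return;
+		}
+		if (!worst || rq->curr->deadline > worst->curr->deadline)
+			worst = rq;
+	}
+	if (worst && p->deadline < worst->curr->deadline)
+		resched_cpu(cpu_of(worst));	/* preempt the worst task */
+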
+Data protection.
+
+BFS has one single lock protecting the process local data of every task in the
+global queue. Thus every insertion, removal and modification of task data in the
+global runqueue needs to grab the global lock. However, once a task is taken by
+a CPU, the CPU has its own local data copy of the running process' accounting
+information which only that CPU accesses and modifies (such as during a
+timer tick) thus allowing the accounting data to be updated lockless. Once a
+CPU has taken a task to run, it removes it from the global queue. Thus the
+global queue only ever has, at most,
+
+ (number of tasks requesting cpu time) - (number of logical CPUs) + 1
+
+tasks in the global queue. For example, with 16 logical CPUs and 20 tasks
+requesting cpu time, at most 5 tasks are queued at once. This value is
+relevant for the time taken to look up tasks during scheduling, and it will
+grow if tasks whose CPU affinity restricts where they may run outnumber the
+CPUs they are allowed to use. The +1 is because when rescheduling a task, the
+CPU's
+currently running task is put back on the queue. Lookup will be described after
+the virtual deadline mechanism is explained.
+
+Virtual deadline.
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in BFS is entirely in the virtual deadline mechanism. The one
+tunable in BFS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in jiffies by this equation:
+
+ jiffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases. Once a task is descheduled, it is put back on the queue, and an
+O(n) lookup of all queued-but-not-running tasks is done to determine which has
+the earliest deadline and that task is chosen to receive CPU next.
+
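+In code terms, a minimal sketch of that deadline assignment (a 128-based
+fixed-point ratio table captures the "10% per nice level" growth; the names
+here are illustrative rather than the patch's exact ones):
+
+	/* Index 0 is nice -20 at a ratio of 1.0 (128 fixed point);
+	 * each subsequent nice level is ~10% larger. */
+	static int prio_ratios[40];
+
+	static void init_prio_ratios(void)
+	{
+		int i;
+
+		prio_ratios[0] = 128;
+		for (i = 1; i < 40; i++)
+			prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
+	}
+
+	/* Virtual deadline: now + prio_ratio * rr_interval */
+	static unsigned long task_deadline(int user_prio)
+	{
+		return jiffies + msecs_to_jiffies(prio_ratios[user_prio] *
+						  rr_interval / 128);
+	}
+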
+The CPU proportion of different nice tasks works out to be approximately the
+
+ (prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (jiffies) is
+constantly moving.
+
+Task lookup.
+
+BFS has 103 priority queues. 100 of these are dedicated to the static priority
+of realtime tasks, and the remaining 3 are, in order of best to worst priority,
+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
+scheduling). When a task of these priorities is queued, a bitmap of running
+priorities is set showing which of these priorities has tasks waiting for CPU
+time. When a CPU is made to reschedule, the lookup for the next task to get
+CPU time is performed in the following way:
+
+First the bitmap is checked to see what static priority tasks are queued. If
+any realtime priorities are found, the corresponding queue is checked and the
+first task listed there is taken (provided CPU affinity is suitable) and lookup
+is complete. If the priority corresponds to a SCHED_ISO task, they are also
+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
+stage, every task in the runlist that corresponds to that priority is checked
+to see which has the earliest set deadline, and (provided it has suitable CPU
+affinity) it is taken off the runqueue and given the CPU. If a task has an
+expired deadline, it is taken and the rest of the lookup aborted (as they are
+chosen in FIFO order).
+
+Thus, the lookup is O(n) in the worst case only, where n is as described
+earlier, as tasks may be chosen before the whole task list is looked over.
+
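+A sketch of that two-stage lookup, reusing the priority macros this patch
+adds (the helpers are illustrative, not the patch's own):
+
+	idx = find_first_bit(rq_prio_bitmap, PRIO_LIMIT);
+	if (idx < MAX_RT_PRIO || idx == ISO_PRIO)
+		return queue_first_task(idx);	/* FIFO queues: O(1) */
+
+	/* SCHED_NORMAL/SCHED_IDLEPRIO: O(n) earliest-deadline scan */
+	best = NULL;
+	list_for_each_entry(p, queue_head(idx), run_list) {
+		if (!cpu_allowed(p, cpu))
+			continue;
+		if (deadline_expired(p))
+			return p;	/* expired deadline: FIFO, abort scan */
+		if (!best || p->deadline < best->deadline)
+			best = p;
+	}
+	return best;
+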
+
+Scalability.
+
+The major limitation of BFS will be that of scalability, as the separate
+runqueue designs will have less lock contention as the number of CPUs rises.
+However they do not scale linearly even with separate runqueues as multiple
+runqueues will need to be locked concurrently on such designs to be able to
+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
+across CPUs, and to achieve low enough latency for tasks on a busy CPU when
+other CPUs would be more suited. BFS has the advantage that it requires no
+balancing algorithm whatsoever, as balancing occurs by proxy simply because
+all CPUs draw off the global runqueue, in priority and deadline order. Despite
+the fact that scalability is _not_ the prime concern of BFS, it both shows very
+good scalability to smaller numbers of CPUs and is likely a more scalable design
+at these numbers of CPUs.
+
+It also has some very low overhead scalability features built into the design
+when it has been deemed their overhead is so marginal that they're worth adding.
+The first is the local copy of the running process' data to the CPU it's running
+on to allow that data to be updated lockless where possible. Then there is
+deference paid to the last CPU a task was running on, by trying that CPU first
+when looking for an idle CPU to use the next time it's scheduled. Finally there
+is the notion of cache locality beyond the last running CPU. The sched_domains
+information is used to determine the relative virtual "cache distance" that
+other CPUs have from the last CPU a task was running on. CPUs with shared
+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated
+as cache local. CPUs without shared caches are treated as not cache local, and
+CPUs on different NUMA nodes are treated as very distant. This "relative cache
+distance" is used by modifying the virtual deadline value when doing lookups.
+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for
+"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning
+behind the doubling of deadlines is as follows. The real cost of migrating a
+task from one CPU to another is entirely dependent on the cache footprint of
+the task, how cache intensive the task is, how long it's been running on that
+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and
+how layered the CPU cache is, how fast a context switch is... and so on. In
+other words, it's close to random in the real world where we do more than just
+one sole workload. The only thing we can be sure of is that it's not free. So
+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs
+is more important than cache locality, and cache locality only plays a part
+after that. Doubling the effective deadline is based on the premise that the
+"cache local" CPUs will tend to work on the same tasks up to double the number
+of cache local CPUs, and once the workload is beyond that amount, it is likely
+that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA
+is a value I pulled out of my arse.
+
+When choosing an idle CPU for a waking task, the cache locality is determined
+according to where the task last ran and then idle CPUs are ranked from best
+to worst to choose the most suitable idle CPU based on cache locality, NUMA
+node locality and hyperthread sibling busyness. They are chosen in the
+following preference (if idle):
+
+* Same core, idle or busy cache, idle threads
+* Other core, same cache, idle or busy cache, idle threads.
+* Same node, other CPU, idle cache, idle threads.
+* Same node, other CPU, busy cache, idle threads.
+* Same core, busy threads.
+* Other core, same cache, busy threads.
+* Same node, other CPU, busy threads.
+* Other node, other CPU, idle cache, idle threads.
+* Other node, other CPU, busy cache, idle threads.
+* Other node, other CPU, busy threads.
+
+This shows the SMT or "hyperthread" awareness in the design as well which will
+choose a real idle core first before a logical SMT sibling which already has
+tasks on the physical CPU.
+
+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
+However this benchmarking was performed on an earlier design that was far less
+scalable than the current one so it's hard to know how scalable it is in terms
+of both CPUs (due to the global runqueue) and heavily loaded machines (due to
+O(n) lookup) at this stage. Note that in terms of scalability, the number of
+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
+results are very promising indeed, without needing to tweak any knobs, features
+or options. Benchmark contributions are most welcome.
+
+
+Features
+
+As the initial prime target audience for BFS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
+support for CGROUPS. The average user should neither need to know what these
+are, nor should they need to be using them to have good desktop behaviour.
+
+rr_interval
+
+There is only one "scheduler" tunable, the round robin interval. This can be
+accessed in
+
+ /proc/sys/kernel/rr_interval
+
+The value is in milliseconds, and the default value is set to 6 on a
+uniprocessor machine, and automatically set to a progressively higher value on
+multiprocessor machines. The reasoning behind increasing the value on more CPUs
+is that the effective latency is decreased by virtue of there being more CPUs on
+BFS (for reasons explained above), and increasing the value allows for less
+cache contention and more throughput. Valid values are from 1 to 1000.
+Decreasing the value will decrease latencies at the cost of decreasing
+throughput, while increasing it will improve throughput, but at the cost of
+worsening latencies. The accuracy of the rr interval is limited by HZ resolution
+of the kernel configuration. Thus, the worst case latencies are usually slightly
+higher than this actual value. The default value of 6 is not an arbitrary one.
+It is based on the fact that humans can detect jitter at approximately 7ms, so
+aiming for much lower latencies is pointless under most circumstances. It is
+worth noting this fact when comparing the latency performance of BFS to other
+schedulers. Worst case latencies being higher than 7ms are far worse than
+average latencies not being in the microsecond range.
+
+Isochronous scheduling.
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+ schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of _total CPU_ available across the machine, configurable
+as a percentage in the following "resource handling" tunable (as opposed to a
+scheduler tunable):
+
+ /proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average.
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running at realtime priority indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
+
+A feature of BFS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+Because some applications constantly set their policy as well as their nice
+level, there is potential for them to undo an override to SCHED_ISO that the
+user specified on the command line. To counter this, once
+a task has been set to SCHED_ISO policy, it needs superuser privileges to set
+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
+processes and threads will also inherit the ISO policy.
+
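+For programs that set their own policy, SCHED_ISO can also be requested
+directly with sched_setscheduler(). A minimal userspace sketch (hedged:
+glibc headers do not define SCHED_ISO, so its value, 4 under this patch,
+has to be supplied by hand):
+
+	#include <sched.h>
+	#include <stdio.h>
+
+	#ifndef SCHED_ISO
+	#define SCHED_ISO 4	/* value from include/uapi/linux/sched.h */
+	#endif
+
+	int main(void)
+	{
+		/* Non-realtime policies require a priority of 0 */
+		struct sched_param sp = { .sched_priority = 0 };
+
+		if (sched_setscheduler(0, SCHED_ISO, &sp) == -1)
+			perror("sched_setscheduler");
+		return 0;
+	}
+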
+Idleprio scheduling.
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start
+a video encode or so on without any slowdown of other tasks. To prevent
+tasks under this policy from grabbing shared resources and holding them
+indefinitely, if it
+detects a state where the task is waiting on I/O, the machine is about to
+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
+per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
+be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+ schedtool -D -e ./mprime
+
+Subtick accounting.
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the
+timer tick frequency (HZ) is lowered. It is possible to create an application
+which uses almost 100% CPU, yet by being descheduled at the right time, records
+zero CPU usage. While the main problem with this is that there are possible
+security implications, it is also difficult to determine how much CPU a task
+really does use. BFS tries to use the sub-tick accounting from the TSC clock,
+where possible, to determine real CPU usage. This is not entirely reliable, but
+is far more likely to produce accurate CPU usage data than the existing designs
+and will not show tasks as consuming no CPU usage when they actually are. Thus,
+the amount of CPU reported as being used by BFS will more accurately represent
+how much CPU the task itself is using (as is shown for example by the 'time'
+application), so the reported values may be quite different to other schedulers.
+Values reported as the 'load' are more prone to problems with this design, but
+per process values are closer to real usage. When comparing throughput of BFS
+to other designs, it is important to compare the actual completed work in terms
+of total wall clock time taken and total work done, rather than the reported
+"cpu usage".
+
+
+Con Kolivas <kernel@kolivas.org> Fri Aug 27 2010
diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt
new file mode 100644
index 00000000000000..ae28b85c999513
--- /dev/null
+++ b/Documentation/scheduler/sched-MuQSS.txt
@@ -0,0 +1,373 @@
+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas.
+
+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with
+one 8 level skiplist per runqueue, and fine grained locking for much more
+scalability.
+
+
+Goals.
+
+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from
+here on (pronounced mux) is to completely do away with the complex designs of
+the past for the cpu process scheduler and instead implement one that is very
+simple in basic design. The main focus of MuQSS is to achieve excellent desktop
+interactivity and responsiveness without heuristics and tuning knobs that are
+difficult to understand, impossible to model and predict the effect of, and when
+tuned to one workload cause massive detriment to another, while still being
+scalable to many CPUs and processes.
+
+
+Design summary.
+
+MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1)
+lookup, earliest effective virtual deadline first tickless design, loosely based
+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase
+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler.
+Each component shall be described in order to understand the significance of,
+and reasoning for it.
+
+
+Design reasoning.
+
+In BFS, the use of a single runqueue across all CPUs meant that each CPU would
+need to scan the entire runqueue looking for the process with the earliest
+deadline and schedule that next, regardless of which CPU it originally came
+from. This made BFS deterministic with respect to latency and provided
+guaranteed latencies dependent on number of processes and CPUs. The single
+runqueue, however, meant that all CPUs would compete for the single lock
+protecting it, which would lead to increasing lock contention as the number of
+CPUs rose and appeared to limit scalability of common workloads beyond 16
+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously
+increased overhead proportionate to the number of queued processes and led to
+cache thrashing while iterating over the linked list.
+
+MuQSS is an evolution of BFS, designed to maintain the same scheduling
+decision mechanism and be virtually deterministic without relying on the
+constrained design of the single runqueue by splitting out the single runqueue
+to be per-CPU and use skiplists instead of linked lists.
+
+The original reason for going back to a single runqueue design for BFS was that
+once multiple runqueues are introduced, per-CPU or otherwise, there will be
+complex interactions as each runqueue will be responsible for the scheduling
+latency and fairness of the tasks only on its own runqueue, and to achieve
+fairness and low latency across multiple CPUs, any advantage in throughput of
+having CPU local tasks causes other disadvantages. This is due to requiring a
+very complex balancing system to at best achieve some semblance of fairness
+across CPUs and can only maintain relatively low latency for tasks bound to the
+same CPUs, not across them. To increase said fairness and latency across CPUs,
+the advantage of local runqueue locking, which makes for better scalability, is
+lost due to having to grab multiple locks.
+
+MuQSS works around the problems inherent in multiple runqueue designs by
+making its skip lists priority ordered and through novel use of lockless
+examination of each other runqueue it can decide if it should take the earliest
+deadline task from another runqueue for latency reasons, or for CPU balancing
+reasons. It still does not have a balancing system, instead letting the
+next-task scheduling decision and the task wakeup CPU choice bring about
+balancing by virtue of those choices.
+
+As a further evolution of the design, MuQSS normally configures sharing of
+runqueues in a logical fashion for when CPU resources are shared for improved
+latency and throughput. By default it shares runqueues and locks between
+multicore siblings. Optionally it can be configured to run with sharing of
+SMT siblings only, all SMP packages or no sharing at all. Additionally it can
+be selected at boot time.
+
+
+Design details.
+
+Custom skip list implementation:
+
+To avoid the overhead of building up and tearing down skip list structures,
+the variant used by MuQSS has a number of optimisations making it specific for
+its use case in the scheduler. It uses static arrays of 8 'levels' instead of
+building up and tearing down structures dynamically. This makes each runqueue
+only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU
+it means that it scales O(log N) up to 64k x number of logical CPUs which is
+far beyond the realistic task limits each CPU could handle. By being 8 levels
+it also makes the array exactly one cacheline in size. Additionally, each
+skip list node is bidirectional making insertion and removal amortised O(1),
+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very
+first entry in each list at all times with MuQSS, so there is never a need to
+do a search and thus lookup is always O(1). In interactive mode, the queues
+will be searched beyond their first entry if the first task is not suitable
+for affinity or SMT nice reasons.
+
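+Using the skip list structures this patch adds in include/linux/skip_list.h,
+that O(1) lookup amounts to dereferencing the header's level-0 link (a
+sketch, assuming the header node links back to itself while the list is
+empty):
+
+	static inline struct task_struct *skiplist_first_task(skiplist *l)
+	{
+		skiplist_node *node = l->header->next[0];
+
+		if (node == l->header)	/* empty queue */
+			return NULL;
+		return node->value;	/* value carries the task_struct */
+	}
+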
+Task insertion:
+
+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into
+a custom skip list as described above (based on the original design by William
+Pugh). Insertion is ordered in such a way that there is never a need to do a
+search by ordering tasks according to static priority primarily, and then
+virtual deadline at the time of insertion.
+
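+One way to realise that ordering with a single 64-bit skip list key is to
+put the static priority in the high bits and the virtual deadline below it
+(a sketch of the idea only, not the patch's exact encoding):
+
+	/* A numerically higher prio (= worse priority) sorts later;
+	 * ties are broken by the earlier virtual deadline. */
+	static inline u64 sl_key(const struct task_struct *p)
+	{
+		return ((u64)p->prio << 56) |
+		       ((p->deadline >> 8) & ((1ULL << 56) - 1));
+	}
+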
+Niffies:
+
+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are
+of nanosecond resolution. Niffies are calculated per-runqueue from the high
+resolution TSC timers, and in order to maintain fairness are synchronised
+between CPUs whenever both runqueues are locked concurrently.
+
+Virtual deadline:
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in MuQSS is entirely in the virtual deadline mechanism. The one
+tunable in MuQSS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in niffies by this equation:
+
+ niffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases.
+
+The CPU proportion of different nice tasks works out to be approximately the
+
+ (prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (niffies) is
+constantly moving.
+
+Task lookup:
+
+As tasks are already pre-ordered according to anticipated scheduling order in
+the skip lists, lookup for the next suitable task per-runqueue is always a
+matter of simply selecting the first task in the 0th level skip list entry.
+In order to maintain optimal latency and fairness across CPUs, MuQSS does a
+novel examination of every other runqueue in cache locality order, choosing the
+best task across all runqueues. This provides near-determinism of how long any
+task across the entire system may wait before receiving CPU time. The other
+runqueues are first examined locklessly and then trylocked to minimise the
+potential lock contention if they are likely to have a suitable, better task.
+Each other runqueue lock is only held for as long as it takes to examine the
+entry for suitability. In "interactive" mode, the default setting, MuQSS will
+look for the best deadline task across all CPUs, while in !interactive mode,
+it will only select a better deadline task from another CPU if it is more
+heavily laden than the current one.
+
+Lookup is therefore O(k) where k is the number of CPUs.
+
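+A hedged sketch of that examine-then-trylock pass (the names are
+illustrative, not the patch's exact code):
+
+	best = first_task(rq);	/* our own runqueue's candidate */
+	for_each_rq_in_locality_order(other, rq) {
+		/* Cheap lockless peek; the result may be stale. */
+		struct task_struct *p = READ_ONCE(other->preferred_next);
+
+		if (!p || p->deadline >= best->deadline)
+			continue;
+		if (!raw_spin_trylock(&other->lock))
+			continue;	/* contended: skip rather than wait */
+		/* Re-check under the lock before taking the task. */
+		p = first_task(other);
+		if (p && p->deadline < best->deadline && suitable(p, rq))
+			best = take_task(other, p);
+		raw_spin_unlock(&other->lock);
+	}
+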
+
+Latency.
+
+Through the use of virtual deadlines to govern the scheduling order of normal
+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by
+the rr_interval tunable which is set to 6ms by default. This means that the
+longest a CPU bound task will wait for more CPU is proportional to the number
+of running tasks and in the common case of 0-2 running tasks per CPU, will be
+under the 7ms threshold for human perception of jitter. Additionally, as newly
+woken tasks will have an early deadline from their previous runtime, the very
+tasks that are usually latency sensitive will have the shortest interval for
+activation, usually preempting any existing CPU bound tasks.
+
+Tickless expiry:
+
+A feature of MuQSS is that it is not tied to the resolution of the chosen tick
+rate in Hz, instead depending entirely on the high resolution timers where
+possible for sub-millisecond accuracy on timeouts regardless of the underlying
+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates
+such as 100 by default, benefiting from the improved throughput and lower
+power usage it provides. Another advantage of this approach is that in
+combination with the Full No HZ option, which disables ticks on running task
+CPUs instead of just idle CPUs, the tick can be disabled at all times
+regardless of how many tasks are running instead of being limited to just one
+running task. Note that this option is NOT recommended for regular desktop
+users.
+
+
+Scalability and balancing.
+
+Unlike traditional approaches where balancing is a combination of CPU selection
+at task wakeup and intermittent balancing based on a vast array of rules set
+according to architecture, busyness calculations and special case management,
+MuQSS indirectly balances on the fly at task wakeup and next task selection.
+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for
+each logical CPU and uses this to aid task/CPU selection when CPUs are busy.
+Additionally it selects any idle CPUs, if they are available, at any time over
+busy CPUs according to the following preference:
+
+ * Same thread, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+
+Mux is therefore SMT, MC and NUMA aware without the need for extra
+intermittent balancing to keep CPUs busy and make the most of cache
+coherency.
+
+
+Features
+
+As the initial prime target audience for MuQSS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval,
+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
+does _not_ now feature is support for CGROUPS. The average user should neither
+need to know what these are, nor should they need to be using them to have good
+desktop behaviour. However since some applications refuse to work without
+cgroups, one can enable them with MuQSS as a stub and the filesystem will be
+created which will allow the applications to work.
+
+rr_interval:
+
+ /proc/sys/kernel/rr_interval
+
+The value is in milliseconds, and the default value is set to 6. Valid values
+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
+decreasing throughput, while increasing it will improve throughput, but at the
+cost of worsening latencies. It is based on the fact that humans can detect
+jitter at approximately 7ms, so aiming for much lower latencies is pointless
+under most circumstances. It is worth noting this fact when comparing the
+latency performance of MuQSS to other schedulers. Worst case latencies being
+higher than 7ms are far worse than average latencies not being in the
+microsecond range.
+
+interactive:
+
+ /proc/sys/kernel/interactive
+
+The value is a simple boolean of 1 for on and 0 for off and is set to on by
+default. Disabling this will disable the near-determinism of MuQSS when
+selecting the next task by not examining all CPUs for the earliest deadline
+task, or which CPU to wake to, instead prioritising CPU balancing for improved
+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
+instead of across the whole system.
+
+Runqueue sharing.
+
+By default MuQSS chooses to share runqueue resources (specifically the skip
+list and locking) between multicore siblings. It is configurable at build time
+to select between None, SMT, MC and SMP, corresponding to no sharing, sharing
+only between simultaneous multithreading siblings, multicore siblings, or
+symmetric multiprocessing physical packages. Additionally it can be set at
+boot time with the use of the rqshare parameter. The reason for configurability
+is that some architectures have CPUs with many multicore siblings (>= 16)
+where it may be detrimental to throughput to share runqueues and another
+sharing option may be desirable. Additionally, more sharing than usual can
+improve latency on a system-wide level at the expense of throughput if desired.
+
+The options are:
+none, smt, mc, smp
+
+eg:
+ rqshare=mc
+
+Isochronous scheduling:
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+ schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of CPU available per CPU, configurable as a percentage in
+the following "resource handling" tunable (as opposed to a scheduler tunable):
+
+iso_cpu:
+
+ /proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average.
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running at realtime priority indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
+
+A feature of MuQSS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+
+
+Idleprio scheduling:
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start a
+video encode or so on without any slowdown of other tasks. To prevent tasks
+under this policy from grabbing shared resources and holding them
+indefinitely, if it detects a
+state where the task is waiting on I/O, the machine is about to suspend to ram
+and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has
+been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without
+superuser privileges since it is effectively a lower scheduling policy. Tasks
+can be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+	schedtool -D -e ./mprime
+
+Subtick accounting:
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the timer
+tick frequency (HZ) is lowered. It is possible to create an application which
+uses almost 100% CPU, yet by being descheduled at the right time, records zero
+CPU usage. While the main problem with this is that there are possible security
+implications, it is also difficult to determine how much CPU a task really does
+use. Mux uses sub-tick accounting from the TSC clock to determine real CPU
+usage. Thus, the amount of CPU reported as being used by MuQSS will more
+accurately represent how much CPU the task itself is using (as is shown for
+example by the 'time' application), so the reported values may be quite
+different to other schedulers. When comparing throughput of MuQSS to other
+designs, it is important to compare the actual completed work in terms of total
+wall clock time taken and total work done, rather than the reported "cpu usage".
+
+Symmetric MultiThreading (SMT) aware nice:
+
+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the
+logical CPU count rises by adding thread units to each CPU core, allowing more
+than one task to be run simultaneously on the same core, the disadvantage of it
+is that the CPU power is shared between the tasks, not summating to the power
+of two CPUs. The practical upshot of this is that two tasks running on
+separate threads of the same core run significantly slower than if they had one
+core each to run on. While smart CPU selection allows each task to have a core
+to itself whenever available (as is done on MuQSS), it cannot offset the
+slowdown that occurs when the cores are all loaded and only a thread is left.
+Most of the time this is harmless as the CPU is effectively overloaded at this
+point and the extra thread is of benefit. However when running a niced task in
+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets
+precisely the same amount of CPU power as the unniced one. MuQSS has an
+optional configuration feature known as SMT-NICE which selectively idles the
+secondary niced thread for a period proportional to the nice difference,
+allowing CPU distribution according to nice level to be maintained, at the
+expense of a small amount of extra overhead. If this is configured in on a
+machine without SMT threads, the overhead is minimal.
+
+
+Con Kolivas <kernel@kolivas.org> Sat, 29th October 2016
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 5998106faa600b..e1a96348428993 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -674,6 +674,8 @@ config HZ
default 1200 if HZ_1200
default 1024
+source "kernel/Kconfig.MuQSS"
+
config SRM_ENV
tristate "SRM environment through procfs"
depends on PROC_FS
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 24804f11302d7f..2c31f181c49a80 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1197,6 +1197,8 @@ config SCHED_SMT
MultiThreading at a cost of slightly increased overhead in some
places. If unsure say N here.
+source "kernel/Kconfig.MuQSS"
+
config HAVE_ARM_SCU
bool
help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9f1d8566bbf95c..120ef842092897 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1006,6 +1006,8 @@ config SCHED_SMT
MultiThreading at a cost of slightly increased overhead in some
places. If unsure say N here.
+source "kernel/Kconfig.MuQSS"
+
config NR_CPUS
int "Maximum number of CPUs (2-4096)"
range 2 4096
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 088dd2afcfe471..bf97c467a12a23 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -873,6 +873,8 @@ config SCHED_SMT
when dealing with POWER5 cpus at a cost of slightly increased
overhead in some places. If unsure say N here.
+source "kernel/Kconfig.MuQSS"
+
config PPC_DENORMALISATION
bool "PowerPC denormalisation exception handling"
depends on PPC_BOOK3S_64
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 369206489895a2..6161cf738f6497 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -51,11 +51,6 @@ static struct task_struct *spusched_task;
static struct timer_list spusched_timer;
static struct timer_list spuloadavg_timer;
-/*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO 120
-
/*
* Frequency of the spu scheduler tick. By default we do one SPU scheduler
* tick for every 10 CPU scheduler ticks.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0045e1b4419021..eababe7cde9376 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1010,6 +1010,22 @@ config NR_CPUS
config SCHED_SMT
def_bool y if SMP
+config SMT_NICE
+ bool "SMT (Hyperthreading) aware nice priority and policy support"
+ depends on SCHED_MUQSS && SCHED_SMT
+ default y
+ help
+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness
+ of the use of 'nice' levels and different scheduling policies
+ (e.g. realtime) due to sharing of CPU power between hyperthreads.
+ SMT nice support makes each logical CPU aware of what is running on
+ its hyperthread siblings, maintaining appropriate distribution of
+ CPU according to nice levels and scheduling policies at the expense
+ of slightly increased overhead.
+
+ If unsure say Y here.
+
+
config SCHED_MC
def_bool y
prompt "Multi-core scheduler support"
@@ -1040,6 +1056,8 @@ config SCHED_MC_PRIO
If unsure say Y here.
+source "kernel/Kconfig.MuQSS"
+
config UP_LATE_INIT
def_bool y
depends on !SMP && X86_LOCAL_APIC
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index c3038cdc686500..da7ac1fdedb8f2 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -40,6 +40,7 @@ choice
default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ
default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if ARM64 || ARM
default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if X86_INTEL_PSTATE && SMP
+ default CPU_FREQ_DEFAULT_GOV_ONDEMAND if !X86_INTEL_PSTATE
default CPU_FREQ_DEFAULT_GOV_PERFORMANCE
help
This option sets which CPUFreq governor shall be loaded at
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9cbd915025ad75..f4f05b4cb2af72 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -476,7 +476,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
seq_puts(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
- (unsigned long long)task->se.sum_exec_runtime,
+ (unsigned long long)tsk_seruntime(task),
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 40fc5813cf9321..c7b8dd9011311a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -35,7 +35,11 @@ extern struct cred init_cred;
#define INIT_PREV_CPUTIME(x)
#endif
+#ifdef CONFIG_SCHED_MUQSS
+#define INIT_TASK_COMM "MuQSS"
+#else
#define INIT_TASK_COMM "swapper"
+#endif
/* Attach to the init_task data structure for proper alignment */
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index e9bfe6972aed9f..16ba1c7e5bde28 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -53,6 +53,8 @@ enum {
*/
static inline int task_nice_ioprio(struct task_struct *task)
{
+ if (iso_task(task))
+ return 0;
return (task_nice(task) + 20) / 5;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 32813c345115fb..4d25a51883d81a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -36,6 +36,10 @@
#include <linux/kcsan.h>
#include <asm/kmap_size.h>
+#ifdef CONFIG_SCHED_MUQSS
+#include <linux/skip_list.h>
+#endif
+
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
struct backing_dev_info;
@@ -677,8 +681,10 @@ struct task_struct {
unsigned int flags;
unsigned int ptrace;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS)
int on_cpu;
+#endif
+#ifdef CONFIG_SMP
struct __call_single_node wake_entry;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
@@ -704,10 +710,25 @@ struct task_struct {
int static_prio;
int normal_prio;
unsigned int rt_priority;
+#ifdef CONFIG_SCHED_MUQSS
+ int time_slice;
+ u64 deadline;
+ skiplist_node node; /* Skip list node */
+ u64 last_ran;
+ u64 sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_SMT_NICE
+ int smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ bool zerobound; /* Bound to CPU0 for hotplug */
+#endif
+ unsigned long rt_timeout;
+#else /* CONFIG_SCHED_MUQSS */
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
+#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
@@ -915,6 +936,10 @@ struct task_struct {
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
u64 utimescaled;
u64 stimescaled;
+#endif
+#ifdef CONFIG_SCHED_MUQSS
+ /* Unbanked cpu time */
+ unsigned long utime_ns, stime_ns;
#endif
u64 gtime;
struct prev_cputime prev_cputime;
@@ -1407,6 +1432,40 @@ struct task_struct {
*/
};
+#ifdef CONFIG_SCHED_MUQSS
+#define tsk_seruntime(t) ((t)->sched_time)
+#define tsk_rttimeout(t) ((t)->rt_timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+}
+
+void print_scheduler_version(void);
+
+static inline bool iso_task(struct task_struct *p)
+{
+ return (p->policy == SCHED_ISO);
+}
+#else /* CFS */
+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime)
+#define tsk_rttimeout(t) ((t)->rt.timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+ p->nr_cpus_allowed = current->nr_cpus_allowed;
+}
+
+static inline void print_scheduler_version(void)
+{
+ printk(KERN_INFO "CFS CPU scheduler.\n");
+}
+
+static inline bool iso_task(struct task_struct *p)
+{
+ return false;
+}
+#endif /* CONFIG_SCHED_MUQSS */
+
static inline struct pid *task_pid(struct task_struct *task)
{
return task->thread_pid;
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 1aff00b65f3cb9..73d6319a856abe 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b)
#ifdef CONFIG_SMP
struct root_domain;
+#ifdef CONFIG_SCHED_MUQSS
+static inline void dl_clear_root_domain(struct root_domain *rd)
+{
+}
+static inline void dl_add_task_root_domain(struct task_struct *p)
+{
+}
+#else /* CONFIG_SCHED_MUQSS */
extern void dl_add_task_root_domain(struct task_struct *p);
extern void dl_clear_root_domain(struct root_domain *rd);
+#endif /* CONFIG_SCHED_MUQSS */
#endif /* CONFIG_SMP */
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 6d67e9a5af6bb4..101fe470aa8fa6 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void);
static inline void nohz_balance_enter_idle(int cpu) { }
#endif
-#ifdef CONFIG_NO_HZ_COMMON
+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS)
void calc_load_nohz_start(void);
void calc_load_nohz_remote(struct rq *rq);
void calc_load_nohz_stop(void);
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index ab83d85e1183aa..0e1898e247c4c0 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -14,6 +14,12 @@
*/
#define MAX_RT_PRIO 100
+#ifdef CONFIG_SCHED_MUQSS
+#define ISO_PRIO (MAX_RT_PRIO)
+#define NORMAL_PRIO (MAX_RT_PRIO + 1)
+#define IDLE_PRIO (MAX_RT_PRIO + 2)
+#define PRIO_LIMIT ((IDLE_PRIO) + 1)
+#endif /* CONFIG_SCHED_MUQSS */
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index e5af028c08b494..010b2244e0b6a1 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
if (policy == SCHED_FIFO || policy == SCHED_RR)
return true;
+#ifndef CONFIG_SCHED_MUQSS
if (policy == SCHED_DEADLINE)
return true;
+#endif
return false;
}
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ef02be869cf286..9e504da356c86c 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -93,7 +93,7 @@ int kernel_wait(pid_t pid, int *stat);
extern void free_task(struct task_struct *tsk);
/* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS)
extern void sched_exec(void);
#else
#define sched_exec() {}
diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h
new file mode 100644
index 00000000000000..d4be84ba273b6d
--- /dev/null
+++ b/include/linux/skip_list.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_SKIP_LISTS_H
+#define _LINUX_SKIP_LISTS_H
+typedef u64 keyType;
+typedef void *valueType;
+
+typedef struct nodeStructure skiplist_node;
+
+struct nodeStructure {
+ int level; /* Levels in this structure */
+ keyType key;
+ valueType value;
+ skiplist_node *next[8];
+ skiplist_node *prev[8];
+};
+
+typedef struct listStructure {
+ int entries;
+ int level; /* Maximum level of the list
+ (1 more than the number of levels in the list) */
+ skiplist_node *header; /* pointer to header */
+} skiplist;
+
+void skiplist_init(skiplist_node *slnode);
+skiplist *new_skiplist(skiplist_node *slnode);
+void free_skiplist(skiplist *l);
+void skiplist_node_init(skiplist_node *node);
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed);
+void skiplist_delete(skiplist *l, skiplist_node *node);
+
+static inline bool skiplist_node_empty(skiplist_node *node) {
+ return (!node->next[0]);
+}
+#endif /* _LINUX_SKIP_LISTS_H */
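As a rough illustration of how this small API is meant to be driven (a hypothetical sketch, not part of the patch; the real callers live in kernel/skip_list.c and kernel/sched/MuQSS.c, where each task carries an embedded node), a caller keeps one header node per list and inserts/deletes through the stored node reference:

    /* Hypothetical object carrying its own skiplist node. */
    struct demo_item {
        skiplist_node node;     /* embedded node gives O(level) delete */
        u64 key;                /* e.g. a deadline used as the sort key */
    };

    static skiplist_node demo_head;             /* header node backing the list */

    static void demo_skiplist_usage(struct demo_item *item, u64 now)
    {
        skiplist *list;

        skiplist_init(&demo_head);              /* prepare the header node */
        list = new_skiplist(&demo_head);        /* allocate the list structure */
        skiplist_node_init(&item->node);        /* node starts detached/empty */

        /* Sorted insert keyed by item->key; randseed picks the node level. */
        skiplist_insert(list, &item->node, item->key, item, (unsigned int)now);

        if (!skiplist_node_empty(&item->node))  /* i.e. currently queued */
            skiplist_delete(list, &item->node);

        free_skiplist(list);
    }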
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab26e..f48c5c5da6511e 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -115,9 +115,16 @@ struct clone_args {
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
-/* SCHED_ISO: reserved but not implemented yet */
+/* SCHED_ISO: Implemented on MuQSS only */
#define SCHED_IDLE 5
+#ifdef CONFIG_SCHED_MUQSS
+#define SCHED_ISO 4
+#define SCHED_IDLEPRIO SCHED_IDLE
+#define SCHED_MAX (SCHED_IDLEPRIO)
+#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX)
+#else /* CONFIG_SCHED_MUQSS */
#define SCHED_DEADLINE 6
+#endif /* CONFIG_SCHED_MUQSS */
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
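With SCHED_ISO now a real policy value, an unprivileged task can request it through the ordinary scheduler syscalls. A minimal userspace sketch (illustrative only, not part of the patch; libc headers do not define SCHED_ISO, so the value 4 from the definition above is used directly):

    /* Hypothetical standalone program demonstrating the new policy value. */
    #include <sched.h>
    #include <stdio.h>

    #ifndef SCHED_ISO
    #define SCHED_ISO 4     /* matches the uapi value added above */
    #endif

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 0 };  /* non-RT policies use priority 0 */

        /* pid 0 == the calling thread; fails with EINVAL on non-MuQSS kernels. */
        if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
            perror("sched_setscheduler");
            return 1;
        }
        return 0;
    }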
diff --git a/init/Kconfig b/init/Kconfig
index a61c92066c2e41..d94baf3b83267b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -104,6 +104,18 @@ config THREAD_INFO_IN_TASK
menu "General setup"
+config SCHED_MUQSS
+ bool "MuQSS cpu scheduler"
+ select HIGH_RES_TIMERS
+ help
+ The Multiple Queue Skiplist Scheduler for excellent interactivity and
+ responsiveness on the desktop and highly scalable deterministic
+ low latency on any hardware.
+
+ Say Y here.
+ default y
+
+
config BROKEN
bool
@@ -522,6 +534,7 @@ config SCHED_THERMAL_PRESSURE
default y if ARM64
depends on SMP
depends on CPU_FREQ_THERMAL
+ depends on !SCHED_MUQSS
help
Select this option to enable thermal pressure accounting in the
scheduler. Thermal pressure is the value conveyed to the scheduler
@@ -871,6 +884,7 @@ config NUMA_BALANCING
depends on ARCH_SUPPORTS_NUMA_BALANCING
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
depends on SMP && NUMA && MIGRATION
+ depends on !SCHED_MUQSS
help
This option adds support for automatic NUMA aware memory/task placement.
The mechanism is quite primitive and is based on migrating memory when
@@ -955,9 +969,13 @@ menuconfig CGROUP_SCHED
help
This feature lets CPU scheduler recognize task groups and control CPU
bandwidth allocation to such task groups. It uses cgroups to group
- tasks.
+ tasks. In combination with MuQSS this is purely a STUB to create the
+ files associated with the CPU controller cgroup, but most of the
+ controls do nothing. This is useful for working in environments and
+ with applications that will only work if this control group is
+ present.
-if CGROUP_SCHED
+if CGROUP_SCHED && !SCHED_MUQSS
config FAIR_GROUP_SCHED
bool "Group scheduling for SCHED_OTHER"
depends on CGROUP_SCHED
@@ -1086,6 +1104,7 @@ config CGROUP_DEVICE
config CGROUP_CPUACCT
bool "Simple CPU accounting controller"
+ depends on !SCHED_MUQSS
help
Provides a simple controller for monitoring the
total CPU consumed by the tasks in a cgroup.
@@ -1228,6 +1247,7 @@ config CHECKPOINT_RESTORE
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
+ depends on !SCHED_MUQSS
select CGROUPS
select CGROUP_SCHED
select FAIR_GROUP_SCHED
diff --git a/init/init_task.c b/init/init_task.c
index 8b08c2e19cbb5b..984f6fd8a5bba5 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -75,9 +75,17 @@ struct task_struct init_task
.stack = init_stack,
.usage = REFCOUNT_INIT(2),
.flags = PF_KTHREAD,
+#ifdef CONFIG_SCHED_MUQSS
+ .prio = NORMAL_PRIO,
+ .static_prio = MAX_PRIO - 20,
+ .normal_prio = NORMAL_PRIO,
+ .deadline = 0,
+ .time_slice = 1000000,
+#else
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
+#endif
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.cpus_mask = CPU_MASK_ALL,
@@ -87,6 +95,7 @@ struct task_struct init_task
.restart_block = {
.fn = do_no_restart_syscall,
},
+#ifndef CONFIG_SCHED_MUQSS
.se = {
.group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
@@ -94,6 +103,7 @@ struct task_struct init_task
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
.time_slice = RR_TIMESLICE,
},
+#endif
.tasks = LIST_HEAD_INIT(init_task.tasks),
#ifdef CONFIG_SMP
.pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
diff --git a/init/main.c b/init/main.c
index e6836a9400d568..6c26f0af060a5e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1462,6 +1462,8 @@ static int __ref kernel_init(void *unused)
do_sysctl_args();
+ print_scheduler_version();
+
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS
new file mode 100644
index 00000000000000..91688dae437b4e
--- /dev/null
+++ b/kernel/Kconfig.MuQSS
@@ -0,0 +1,106 @@
+choice
+ depends on SMP
+ prompt "CPU scheduler runqueue sharing"
+ default RQ_MC if SCHED_MUQSS
+ default RQ_NONE
+
+config RQ_NONE
+ bool "No sharing"
+ help
+ This is the default behaviour where the CPU scheduler has one runqueue
+ per CPU, whether it is a physical or logical CPU (hyperthread).
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=none
+
+ If unsure, say N.
+
+config RQ_SMT
+ bool "SMT (hyperthread) siblings"
+ depends on SCHED_SMT && SCHED_MUQSS
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ shared by SMT (hyperthread) siblings. As these logical cores share
+ one physical core, sharing the runqueue resource can lead to decreased
+ overhead, lower latency and higher throughput.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=smt
+
+ If unsure, say N.
+
+config RQ_MC
+ bool "Multicore siblings"
+ depends on SCHED_MC && SCHED_MUQSS
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ shared by multicore siblings in addition to any SMT siblings.
+ As these physical cores share caches, sharing the runqueue resource
+ will lead to lower latency, but its effects on overhead and throughput
+ are less predictable. As a general rule, 6 or fewer cores will likely
+ benefit from this, while larger CPUs will only derive a latency
+ benefit. If your workloads are primarily single threaded, this will
+ possibly worsen throughput. If you are only concerned about latency
+ then enable this regardless of how many cores you have.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=mc
+
+ If unsure, say Y.
+
+config RQ_MC_LLC
+ bool "Multicore siblings (LLC)"
+ depends on SCHED_MC && SCHED_MUQSS
+ help
+ With this option enabled, the CPU scheduler will behave similarly to
+ "Multicore siblings", but will additionally take the LLC (last level
+ cache) topology into account when scheduling tasks. This may benefit
+ CPUs with multiple LLC caches, such as Ryzen and Xeon CPUs.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=llc
+
+ If unsure, say N.
+
+config RQ_SMP
+ bool "Symmetric Multi-Processing"
+ depends on SMP && SCHED_MUQSS
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ shared by all physical CPUs unless they are on separate NUMA nodes.
+ As physical CPUs usually do not share resources, sharing the runqueue
+ will normally worsen throughput but improve latency. If you only
+ care about latency, enable this.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=smp
+
+ If unsure, say N.
+
+config RQ_ALL
+ bool "NUMA"
+ depends on SMP && SCHED_MUQSS
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ regardless of the architecture configuration, including across NUMA
+ nodes. This can substantially decrease throughput in NUMA
+ configurations, but light NUMA designs will not be dramatically
+ affected. This option should only be chosen if latency is the prime
+ concern.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=all
+
+ If unsure, say N.
+endchoice
+
+config SHARERQ
+ int
+ default 0 if RQ_NONE
+ default 1 if RQ_SMT
+ default 2 if RQ_MC
+ default 3 if RQ_MC_LLC
+ default 4 if RQ_SMP
+ default 5 if RQ_ALL
diff --git a/kernel/Makefile b/kernel/Makefile
index 4df609be42d07c..747d552a2dab46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o regset.o
+ async.o range.o smpboot.o ucount.o regset.o \
+ skip_list.o
obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
obj-$(CONFIG_MODULES) += kmod.o
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 27725754ac9919..769d773c7182af 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
*/
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
- t3 = tsk->se.sum_exec_runtime;
+ t3 = tsk_seruntime(tsk);
d->cpu_count += t1;
diff --git a/kernel/exit.c b/kernel/exit.c
index 65809fac303875..9504db57d8786c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->curr_target = next_thread(tsk);
}
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+ add_device_randomness((const void*) &tsk_seruntime(tsk),
sizeof(unsigned long long));
/*
@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+ sig->sum_sched_runtime += tsk_seruntime(tsk);
sig->nr_threads--;
__unhash_process(tsk, group_dead);
write_sequnlock(&sig->stats_lock);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6e02094849d303..20498d637adaef 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -498,6 +498,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
}
EXPORT_SYMBOL(kthread_bind);
+#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP)
+extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+
+/*
+ * new_kthread_bind is a special variant of __kthread_bind_mask.
+ * For new threads to work on MuQSS we want to call do_set_cpus_allowed
+ * without setting the task_cpu and without rescheduling the task until it
+ * reschedules on its own, so we call __do_set_cpus_allowed directly, which
+ * only changes the cpumask. This is particularly important for smpboot
+ * threads to work.
+ */
+static void new_kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+ unsigned long flags;
+
+ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)))
+ return;
+
+ /* It's safe because the task is inactive. */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ __do_set_cpus_allowed(p, cpumask_of(cpu));
+ p->flags |= PF_NO_SETAFFINITY;
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+#else
+#define new_kthread_bind(p, cpu) kthread_bind(p, cpu)
+#endif
+
/**
* kthread_create_on_cpu - Create a cpu bound kthread
* @threadfn: the function to run until signal_pending(current).
@@ -518,7 +546,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
cpu);
if (IS_ERR(p))
return p;
- kthread_bind(p, cpu);
+ new_kthread_bind(p, cpu);
/* CPU hotplug need to bind once again when unparking the thread. */
to_kthread(p)->cpu = cpu;
return p;
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 3a4beb9395c48c..8cde4921116e32 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -283,7 +283,7 @@ static bool klp_try_switch_task(struct task_struct *task)
{
static char err_buf[STACK_ERR_BUF_SIZE];
struct rq *rq;
- struct rq_flags flags;
+ struct rq_flags rf;
int ret;
bool success = false;
@@ -305,7 +305,7 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
- rq = task_rq_lock(task, &flags);
+ rq = task_rq_lock(task, &rf);
if (task_running(rq, task) && task != current) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
@@ -324,7 +324,7 @@ static bool klp_try_switch_task(struct task_struct *task)
task->patch_state = klp_target_state;
done:
- task_rq_unlock(rq, task, &flags);
+ task_rq_unlock(rq, task, &rf);
/*
* Due to console deadlock issues, pr_debug() can't be used while
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5fc9c9b70862f7..1ff14a21193d8f 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -22,15 +22,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
+ifdef CONFIG_SCHED_MUQSS
+obj-y += MuQSS.o clock.o cputime.o
+obj-y += idle.o
+obj-y += wait.o wait_bit.o swait.o completion.o
+
+obj-$(CONFIG_SMP) += topology.o
+else
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+endif
+obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
new file mode 100644
index 00000000000000..2d231a3d001be3
--- /dev/null
+++ b/kernel/sched/MuQSS.c
@@ -0,0 +1,8320 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kernel/sched/MuQSS.c, was kernel/sched.c
+ *
+ * Kernel scheduler and related syscalls
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ *
+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
+ * make semaphores SMP safe
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ * by Andrea Arcangeli
+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
+ * hybrid priority-list and round-robin design with
+ * an array-switch method of distributing timeslices
+ * and per-CPU runqueues. Cleanups and useful suggestions
+ * by Davide Libenzi, preemptible kernel bits by Robert Love.
+ * 2003-09-03 Interactivity tuning by Con Kolivas.
+ * 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2007-04-15 Work begun on replacing all interactivity tuning with a
+ * fair scheduling design by Con Kolivas.
+ * 2007-05-05 Load balancing (smp-nice) and other improvements
+ * by Peter Williams
+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ * Thomas Gleixner, Mike Kravetz
+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
+ * a whole lot of those previous things.
+ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS
+ * scheduler by Con Kolivas.
+ * 2019-08-31 LLC bits by Eduards Bezverhijs
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+
+#include <linux/binfmts.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+#include <linux/context_tracking.h>
+#include <linux/cpuset.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/prefetch.h>
+#include <linux/profile.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/sched.h>
+#include <linux/scs.h>
+#include <linux/security.h>
+#include <linux/skip_list.h>
+#include <linux/syscalls.h>
+#include <linux/tick.h>
+#include <linux/wait_bit.h>
+
+#include <asm/irq_regs.h>
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+
+#include "../workqueue_internal.h"
+#include "../../fs/io-wq.h"
+#include "../smpboot.h"
+
+#include "MuQSS.h"
+#include "smp.h"
+
+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p) rt_prio((p)->prio)
+#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
+ (policy) == SCHED_RR)
+#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
+
+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO)
+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy))
+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO)
+
+#define is_iso_policy(policy) ((policy) == SCHED_ISO)
+#define iso_task(p) unlikely(is_iso_policy((p)->policy))
+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO)
+
+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
+
+#define ISO_PERIOD (5 * HZ)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+#define STOP_PRIO (MAX_RT_PRIO - 1)
+
+/*
+ * Some helpers for converting to/from various scales. Use shifts to get
+ * approximate multiples of ten for less overhead.
+ */
+#define APPROX_NS_PS (1073741824) /* Approximate ns per second */
+#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ))
+#define JIFFY_NS (APPROX_NS_PS / HZ)
+#define JIFFY_US (1048576 / HZ)
+#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS)
+#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2)
+#define HALF_JIFFY_US (1048576 / HZ / 2)
+#define MS_TO_NS(TIME) ((TIME) << 20)
+#define MS_TO_US(TIME) ((TIME) << 10)
+#define NS_TO_MS(TIME) ((TIME) >> 20)
+#define NS_TO_US(TIME) ((TIME) >> 10)
+#define US_TO_NS(TIME) ((TIME) << 10)
+#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ)
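For a sense of the error these power-of-two scales accept (an illustrative calculation only, assuming HZ=1000): APPROX_NS_PS is 2^30 = 1073741824, roughly 7% more than 10^9, so JIFFY_NS comes out at about 1.07ms per 1ms jiffy, and MS_TO_NS(1) = 1 << 20 = 1048576, about 5% more than 10^6. The few-percent inaccuracy is deliberately traded for cheap shifts in hot paths instead of multiplies and divides by powers of ten.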
+
+#define RESCHED_US (100) /* Reschedule if less than this many μs left */
+
+void print_scheduler_version(void)
+{
+ printk(KERN_INFO "MuQSS CPU scheduler v0.210 by Con Kolivas.\n");
+}
+
+/*
+ * This is the time all tasks within the same priority round robin.
+ * Value is in ms and set to a minimum of 6ms.
+ * Tunable via /proc interface.
+ */
+int rr_interval __read_mostly = 6;
+
+/*
+ * Tunable to choose whether to prioritise latency or throughput, simple
+ * binary yes or no
+ */
+int sched_interactive __read_mostly = 1;
+
+/*
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run, effectively at realtime priority, averaged over a
+ * rolling five seconds. This is the total over all online cpus.
+ */
+int sched_iso_cpu __read_mostly = 70;
+
+/*
+ * sched_yield_type - Choose what sort of yield sched_yield will perform.
+ * 0: No yield.
+ * 1: Yield only to better priority/deadline tasks. (default)
+ * 2: Expire timeslice and recalculate deadline.
+ */
+int sched_yield_type __read_mostly = 1;
+
+/*
+ * The relative length of deadline for each priority (nice) level.
+ */
+static int prio_ratios[NICE_WIDTH] __read_mostly;
+
+
+/*
+ * The quota handed out to tasks of all priority levels when refilling their
+ * time_slice.
+ */
+static inline int timeslice(void)
+{
+ return MS_TO_US(rr_interval);
+}
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+#ifdef CONFIG_SMP
+
+/* Define RQ share levels */
+#define RQSHARE_NONE 0
+#define RQSHARE_SMT 1
+#define RQSHARE_MC 2
+#define RQSHARE_MC_LLC 3
+#define RQSHARE_SMP 4
+#define RQSHARE_ALL 5
+
+/* Define locality levels */
+#define LOCALITY_SAME 0
+#define LOCALITY_SMT 1
+#define LOCALITY_MC_LLC 2
+#define LOCALITY_MC 3
+#define LOCALITY_SMP 4
+#define LOCALITY_DISTANT 5
+
+/*
+ * This determines what level of runqueue sharing will be done and is
+ * configurable at boot time with the boot parameter rqshare=
+ */
+static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */
+
+static int __init set_rqshare(char *str)
+{
+ if (!strncmp(str, "none", 4)) {
+ rqshare = RQSHARE_NONE;
+ return 1;
+ }
+ if (!strncmp(str, "smt", 3)) {
+ rqshare = RQSHARE_SMT;
+ return 1;
+ }
+ if (!strncmp(str, "mc", 2)) {
+ rqshare = RQSHARE_MC;
+ return 1;
+ }
+ if (!strncmp(str, "llc", 3)) {
+ rqshare = RQSHARE_MC_LLC;
+ return 1;
+ }
+ if (!strncmp(str, "smp", 3)) {
+ rqshare = RQSHARE_SMP;
+ return 1;
+ }
+ if (!strncmp(str, "all", 3)) {
+ rqshare = RQSHARE_ALL;
+ return 1;
+ }
+ return 0;
+}
+__setup("rqshare=", set_rqshare);
+
+/*
+ * Total number of runqueues. Equals number of CPUs when there is no runqueue
+ * sharing but is usually less with SMT/MC sharing of runqueues.
+ */
+static int total_runqueues __read_mostly = 1;
+
+static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
+
+struct rq *cpu_rq(int cpu)
+{
+ return &per_cpu(runqueues, (cpu));
+}
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+
+/*
+ * For asym packing, by default the lower numbered cpu has higher priority.
+ */
+int __weak arch_asym_cpu_priority(int cpu)
+{
+ return -cpu;
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+ return 0*SD_ASYM_PACKING;
+}
+
+#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+EXPORT_SYMBOL_GPL(sched_smt_present);
+#endif
+
+#else
+struct rq *uprq;
+#endif /* CONFIG_SMP */
+
+#include "stats.h"
+
+/*
+ * All common locking functions performed on rq->lock. rq->clock is local to
+ * the CPU accessing it so it can be modified just with interrupts disabled
+ * when we're not updating niffies.
+ * Looking up task_rq must be done under rq->lock to be safe.
+ */
+
+/*
+ * RQ-clock updating methods:
+ */
+
+#ifdef HAVE_SCHED_AVG_IRQ
+static void update_irq_load_avg(struct rq *rq, long delta);
+#else
+static inline void update_irq_load_avg(struct rq *rq, long delta) {}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+ s64 __maybe_unused steal = 0, irq_delta = 0;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ rq->prev_steal_time_rq += steal;
+ delta -= steal;
+ }
+#endif
+ rq->clock_task += delta;
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+ if (irq_delta + steal)
+ update_irq_load_avg(rq, irq_delta + steal);
+#endif
+}
+
+static inline void update_rq_clock(struct rq *rq)
+{
+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+
+ if (unlikely(delta < 0))
+ return;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
+}
+
+/*
+ * Niffies are a globally increasing nanosecond counter. They're only used by
+ * update_load_avg and time_slice_expired, however deadlines are based on them
+ * across CPUs. Update them whenever we will call one of those functions, and
+ * synchronise them across CPUs whenever we hold both runqueue locks.
+ */
+static inline void update_clocks(struct rq *rq)
+{
+ s64 ndiff, minndiff;
+ long jdiff;
+
+ update_rq_clock(rq);
+ ndiff = rq->clock - rq->old_clock;
+ rq->old_clock = rq->clock;
+ jdiff = jiffies - rq->last_jiffy;
+
+ /* Subtract any niffies added by balancing with other rqs */
+ ndiff -= rq->niffies - rq->last_niffy;
+ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies;
+ if (minndiff < 0)
+ minndiff = 0;
+ ndiff = max(ndiff, minndiff);
+ rq->niffies += ndiff;
+ rq->last_niffy = rq->niffies;
+ if (jdiff) {
+ rq->last_jiffy += jdiff;
+ rq->last_jiffy_niffies = rq->niffies;
+ }
+}
+
+/*
+ * Any time we have two runqueues locked we use that as an opportunity to
+ * synchronise niffies to the highest value as idle ticks may have artificially
+ * kept niffies low on one CPU and the truth can only be later.
+ */
+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2)
+{
+ if (rq1->niffies > rq2->niffies)
+ rq2->niffies = rq1->niffies;
+ else
+ rq1->niffies = rq2->niffies;
+}
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+
+/* For when we know rq1 != rq2 */
+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ if (rq1 < rq2) {
+ raw_spin_lock(rq1->lock);
+ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ raw_spin_lock(rq2->lock);
+ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (rq1->lock == rq2->lock) {
+ raw_spin_lock(rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+ } else
+ __double_rq_lock(rq1, rq2);
+ synchronise_niffies(rq1, rq2);
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ raw_spin_unlock(rq1->lock);
+ if (rq1->lock != rq2->lock)
+ raw_spin_unlock(rq2->lock);
+ else
+ __release(rq2->lock);
+}
+
+static inline void lock_all_rqs(void)
+{
+ int cpu;
+
+ preempt_disable();
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ do_raw_spin_lock(rq->lock);
+ }
+}
+
+static inline void unlock_all_rqs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ do_raw_spin_unlock(rq->lock);
+ }
+ preempt_enable();
+}
+
+/* Specially nest trylock an rq */
+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq)
+{
+ if (unlikely(!do_raw_spin_trylock(rq->lock)))
+ return false;
+ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_);
+ synchronise_niffies(this_rq, rq);
+ return true;
+}
+
+/* Unlock a specially nested trylocked rq */
+static inline void unlock_rq(struct rq *rq)
+{
+ spin_release(&rq->lock->dep_map, _RET_IP_);
+ do_raw_spin_unlock(rq->lock);
+}
+
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, mask) \
+ ({ \
+ typeof(ptr) _ptr = (ptr); \
+ typeof(mask) _mask = (mask); \
+ typeof(*_ptr) _old, _val = *_ptr; \
+ \
+ for (;;) { \
+ _old = cmpxchg(_ptr, _val, _val | _mask); \
+ if (_old == _val) \
+ break; \
+ _val = _old; \
+ } \
+ _old; \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif
+
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task, if ->wake_q is !nil already it means
+ * it's already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * In order to ensure that a pending wakeup will observe our pending
+ * state, even in the failed case, an explicit smp_mb() must be used.
+ */
+ smp_mb__before_atomic();
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
+ return false;
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+ return true;
+}
+
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ if (__wake_q_add(head, task))
+ get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+ if (!__wake_q_add(head, task))
+ put_task_struct(task);
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ BUG_ON(!task);
+ /* Task can safely be re-inserted now */
+ node = node->next;
+ task->wake_q.next = NULL;
+
+ /*
+ * wake_up_process() executes a full barrier, which pairs with
+ * the queueing in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+}
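A minimal sketch of the pattern these helpers exist for (illustrative only; demo_wake_q_usage, the lock and the waiter are hypothetical stand-ins for a real caller): wakeups are queued while a lock is held and only issued once it has been dropped, so wake_up_process() never runs under the lock:

    /* Hypothetical helper showing the defer-then-wake pattern. */
    static void demo_wake_q_usage(spinlock_t *lock, struct task_struct *waiter)
    {
        DEFINE_WAKE_Q(wake_q);          /* empty, context-local queue */

        spin_lock(lock);
        wake_q_add(&wake_q, waiter);    /* takes a task reference, no wakeup yet */
        spin_unlock(lock);

        wake_up_q(&wake_q);             /* the actual wake_up_process() calls happen here */
    }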
+
+static inline void smp_sched_reschedule(int cpu)
+{
+ if (likely(cpu_online(cpu)))
+ smp_send_reschedule(cpu);
+}
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
+{
+ int cpu;
+#ifdef CONFIG_LOCKDEP
+ /* Kernel threads call this when creating workqueues while still
+ * inactive from __kthread_bind_mask, holding only the pi_lock */
+ if (!(p->flags & PF_KTHREAD)) {
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(rq->lock);
+ }
+#endif
+ if (test_tsk_need_resched(p))
+ return;
+
+ cpu = task_cpu(p);
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(p);
+ set_preempt_need_resched();
+ return;
+ }
+
+ if (set_nr_and_not_polling(p))
+ smp_sched_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+/*
+ * A task that is not running or queued will not have a node set.
+ * A task that is queued but not running will have a node set.
+ * A task that is currently running will have ->on_cpu set but no node set.
+ */
+static inline bool task_queued(struct task_struct *p)
+{
+ return !skiplist_node_empty(&p->node);
+}
+
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+static inline void resched_if_idle(struct rq *rq);
+
+static inline bool deadline_before(u64 deadline, u64 time)
+{
+ return (deadline < time);
+}
+
+/*
+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
+ * is the key to everything. It distributes cpu fairly amongst tasks of the
+ * same nice value, it proportions cpu according to nice level, it means the
+ * task that last woke up the longest ago has the earliest deadline, thus
+ * ensuring that interactive tasks get low latency on wake up. The CPU
+ * proportion works out to the square of the virtual deadline difference, so
+ * this equation will give nice 19 3% CPU compared to nice 0.
+ */
+static inline u64 prio_deadline_diff(int user_prio)
+{
+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+}
+
+static inline u64 task_deadline_diff(struct task_struct *p)
+{
+ return prio_deadline_diff(TASK_USER_PRIO(p));
+}
+
+static inline u64 static_deadline_diff(int static_prio)
+{
+ return prio_deadline_diff(USER_PRIO(static_prio));
+}
+
+static inline int longest_deadline_diff(void)
+{
+ return prio_deadline_diff(39);
+}
+
+static inline int ms_longest_deadline_diff(void)
+{
+ return NS_TO_MS(longest_deadline_diff());
+}
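Putting rough numbers on these helpers (an illustrative calculation; the prio_ratios initialisation is not part of this hunk, and the assumed base of 128 with roughly 10% growth per nice level comes from the BFS/MuQSS lineage): MS_TO_NS(1) / 128 is 1048576 / 128 = 8192, so with the default rr_interval of 6, prio_deadline_diff(0) = 128 * 6 * 8192 ns, about 6.3ms, i.e. one timeslice worth of deadline offset for nice 0. Nice 19's ratio is then roughly six times larger, giving a deadline offset about six times longer, and since CPU share works out to the square of the deadline difference (per the comment above prio_deadline_diff), that is about 1/36, matching the "nice 19 3% CPU" figure.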
+
+static inline bool rq_local(struct rq *rq);
+
+#ifndef SCHED_CAPACITY_SCALE
+#define SCHED_CAPACITY_SCALE 1024
+#endif
+
+static inline int rq_load(struct rq *rq)
+{
+ return rq->nr_running;
+}
+
+/*
+ * Update the load average for feeding into cpu frequency governors. Use a
+ * rough estimate of a rolling average with ~ time constant of 32ms: over a
+ * 32768us interval the old load decays by 80/128 ~ 0.63, so the decay applied
+ * per microsecond below is 80 / 32768 / 128 == 5 / 262144.
+ * Make sure a call to update_clocks has been made before calling this to get
+ * an updated rq->niffies.
+ */
+static void update_load_avg(struct rq *rq, unsigned int flags)
+{
+ long us_interval, load;
+
+ us_interval = NS_TO_US(rq->niffies - rq->load_update);
+ if (unlikely(us_interval <= 0))
+ return;
+
+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
+ if (unlikely(load < 0))
+ load = 0;
+ load += rq_load(rq) * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144;
+ rq->load_avg = load;
+
+ rq->load_update = rq->niffies;
+ update_irq_load_avg(rq, 0);
+ if (likely(rq_local(rq)))
+ cpufreq_trigger(rq, flags);
+}
+
+#ifdef HAVE_SCHED_AVG_IRQ
+/*
+ * IRQ variant of update_load_avg above. delta is actually time in nanoseconds
+ * here so we scale the current load to how long it's been since the last
+ * update.
+ */
+static void update_irq_load_avg(struct rq *rq, long delta)
+{
+ long us_interval, load;
+
+ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update);
+ if (unlikely(us_interval <= 0))
+ return;
+
+ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144);
+ if (unlikely(load < 0))
+ load = 0;
+ load += NS_TO_US(delta) * SCHED_CAPACITY_SCALE * 5 / 262144;
+ rq->irq_load_avg = load;
+
+ rq->irq_load_update = rq->niffies;
+}
+#endif
+
+/*
+ * Removing from the runqueue. Enter with rq locked. Deleting a task
+ * from the skip list is done via the stored node reference in the task struct
+ * and does not require a full look up. Thus it occurs in O(k) time where k
+ * is the "level" of the list the task was stored at - usually < 4, max 8.
+ */
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ skiplist_delete(rq->sl, &p->node);
+ rq->best_key = rq->node->next[0]->key;
+ update_clocks(rq);
+
+ if (!(flags & DEQUEUE_SAVE)) {
+ sched_info_dequeued(rq, p);
+ psi_dequeue(p, flags & DEQUEUE_SLEEP);
+ }
+ rq->nr_running--;
+ if (rt_task(p))
+ rq->rt_nr_running--;
+ update_load_avg(rq, flags);
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+static bool rcu_read_critical(struct task_struct *p)
+{
+ return p->rcu_read_unlock_special.b.blocked;
+}
+#else /* CONFIG_PREEMPT_RCU */
+#define rcu_read_critical(p) (false)
+#endif /* CONFIG_PREEMPT_RCU */
+
+/*
+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
+ * an idle task, we ensure none of the following conditions are met.
+ */
+static bool idleprio_suitable(struct task_struct *p)
+{
+ return (!(p->sched_contributes_to_load) && !(p->flags & (PF_EXITING)) &&
+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p));
+}
+
+/*
+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
+ * that the iso_refractory flag is not set.
+ */
+static inline bool isoprio_suitable(struct rq *rq)
+{
+ return !rq->iso_refractory;
+}
+
+static inline void inc_nr_running(struct rq *rq)
+{
+ rq->nr_running++;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, 1);
+ }
+}
+
+static inline void dec_nr_running(struct rq *rq)
+{
+ rq->nr_running--;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, -1);
+ }
+}
+
+/*
+ * Adding to the runqueue. Enter with rq locked.
+ */
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ unsigned int randseed, cflags = 0;
+ u64 sl_id;
+
+ if (!rt_task(p)) {
+ /* Check it hasn't gotten rt from PI */
+ if ((idleprio_task(p) && idleprio_suitable(p)) ||
+ (iso_task(p) && isoprio_suitable(rq)))
+ p->prio = p->normal_prio;
+ else
+ p->prio = NORMAL_PRIO;
+ } else
+ rq->rt_nr_running++;
+ /*
+ * The sl_id key passed to the skiplist generates a sorted list.
+ * Realtime and sched iso tasks run FIFO so they only need be sorted
+ * according to priority. The skiplist will put tasks of the same
+ * key inserted later in FIFO order. Tasks of sched normal, batch
+ * and idleprio are sorted according to their deadlines. Idleprio
+ * tasks are offset by an impossibly large deadline value ensuring
+ * they get sorted into last positions, but still according to their
+ * own deadlines. This creates a "landscape" of skiplists running
+ * from priority 0 realtime in first place to the lowest priority
+ * idleprio tasks last. Skiplist insertion is an O(log n) process.
+ */
+ if (p->prio <= ISO_PRIO) {
+ sl_id = p->prio;
+ } else {
+ sl_id = p->deadline;
+ if (idleprio_task(p)) {
+ if (p->prio == IDLE_PRIO)
+ sl_id |= 0xF000000000000000;
+ else
+ sl_id += longest_deadline_diff();
+ }
+ }
+ /*
+ * Some architectures don't have better than microsecond resolution
+ * so mask out ~microseconds as the random seed for skiplist insertion.
+ */
+ update_clocks(rq);
+ if (!(flags & ENQUEUE_RESTORE)) {
+ sched_info_queued(rq, p);
+ psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+ }
+
+ randseed = (rq->niffies >> 10) & 0xFFFFFFFF;
+ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed);
+ rq->best_key = rq->node->next[0]->key;
+ if (p->in_iowait)
+ cflags |= SCHED_CPUFREQ_IOWAIT;
+ inc_nr_running(rq);
+ update_load_avg(rq, cflags);
+}
+
+/*
+ * Returns the relative length of a task's deadline compared to the shortest
+ * deadline, which is that of nice -20.
+ */
+static inline int task_prio_ratio(struct task_struct *p)
+{
+ return prio_ratios[TASK_USER_PRIO(p)];
+}
+
+/*
+ * task_timeslice - all tasks of all priorities get the exact same timeslice
+ * length. CPU distribution is handled by giving different deadlines to
+ * tasks of different priorities. Use 128 as the base value for fast shifts.
+ */
+static inline int task_timeslice(struct task_struct *p)
+{
+ return (rr_interval * task_prio_ratio(p) / 128);
+}
+
+#ifdef CONFIG_SMP
+/* Entered with rq locked */
+static inline void resched_if_idle(struct rq *rq)
+{
+ if (rq_idle(rq))
+ resched_task(rq->curr);
+}
+
+static inline bool rq_local(struct rq *rq)
+{
+ return (rq->cpu == smp_processor_id());
+}
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups to the other runqueues are done locklessly as the occasional
+ * wrong value would be harmless. */
+static int best_smt_bias(struct rq *this_rq)
+{
+ int other_cpu, best_bias = 0;
+
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
+ struct rq *rq = cpu_rq(other_cpu);
+
+ if (rq_idle(rq))
+ continue;
+ if (unlikely(!rq->online))
+ continue;
+ if (!rq->rq_mm)
+ continue;
+ if (likely(rq->rq_smt_bias > best_bias))
+ best_bias = rq->rq_smt_bias;
+ }
+ return best_bias;
+}
+
+static int task_prio_bias(struct task_struct *p)
+{
+ if (rt_task(p))
+ return 1 << 30;
+ else if (task_running_iso(p))
+ return 1 << 29;
+ else if (task_running_idle(p))
+ return 0;
+ return MAX_PRIO - p->static_prio;
+}
+
+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq)
+{
+ return true;
+}
+
+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule;
+
+/* We've already decided p can run on CPU, now test if it shouldn't for SMT
+ * nice reasons. */
+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq)
+{
+ int best_bias, task_bias;
+
+ /* Kernel threads always run */
+ if (unlikely(!p->mm))
+ return true;
+ if (rt_task(p))
+ return true;
+ if (!idleprio_suitable(p))
+ return true;
+ best_bias = best_smt_bias(this_rq);
+ /* The smt siblings are all idle or running IDLEPRIO */
+ if (best_bias < 1)
+ return true;
+ task_bias = task_prio_bias(p);
+ if (task_bias < 1)
+ return false;
+ if (task_bias >= best_bias)
+ return true;
+ /* Dither 25% cpu of normal tasks regardless of nice difference */
+ if (best_bias % 4 == 1)
+ return true;
+ /* Sorry, you lose */
+ return false;
+}
+#else /* CONFIG_SMT_NICE */
+#define smt_schedule(p, this_rq) (true)
+#endif /* CONFIG_SMT_NICE */
+
+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask)
+{
+ set_bit(cpu, (volatile unsigned long *)cpumask);
+}
+
+/*
+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
+ * allow easy lookup of whether any suitable idle CPUs are available.
+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
+ * idle_cpus variable than to do a full bitmask check when we are busy. The
+ * bits are set atomically but read locklessly as occasional false positive /
+ * negative is harmless.
+ */
+static inline void set_cpuidle_map(int cpu)
+{
+ if (likely(cpu_online(cpu)))
+ atomic_set_cpu(cpu, &cpu_idle_map);
+}
+
+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
+{
+ clear_bit(cpu, (volatile unsigned long *)cpumask);
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+ atomic_clear_cpu(cpu, &cpu_idle_map);
+}
+
+static bool suitable_idle_cpus(struct task_struct *p)
+{
+ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map));
+}
+
+/*
+ * Resched current on rq. We don't know if rq is local to this CPU nor if it
+ * is locked so we do not use an intermediate variable for the task to avoid
+ * having it dereferenced.
+ */
+static void resched_curr(struct rq *rq)
+{
+ int cpu;
+
+ if (test_tsk_need_resched(rq->curr))
+ return;
+
+ rq->preempt = rq->curr;
+ cpu = rq->cpu;
+
+ /* We're doing this without holding the rq lock if it's not task_rq */
+
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(rq->curr);
+ set_preempt_need_resched();
+ return;
+ }
+
+ if (set_nr_and_not_polling(rq->curr))
+ smp_sched_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+#define CPUIDLE_DIFF_THREAD (1)
+#define CPUIDLE_DIFF_CORE_LLC (2)
+#define CPUIDLE_DIFF_CORE (4)
+#define CPUIDLE_CACHE_BUSY (8)
+#define CPUIDLE_DIFF_CPU (16)
+#define CPUIDLE_THREAD_BUSY (32)
+#define CPUIDLE_DIFF_NODE (64)
+
+/*
+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the
+ * lowest value would give the most suitable CPU to schedule p onto next. The
+ * order works out to be the following:
+ *
+ * Same thread, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+ */
+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
+ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
+ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD;
+ int cpu_tmp;
+
+ if (cpumask_test_cpu(best_cpu, tmpmask))
+ goto out;
+
+ for_each_cpu(cpu_tmp, tmpmask) {
+ int ranking, locality;
+ struct rq *tmp_rq;
+
+ ranking = 0;
+ tmp_rq = cpu_rq(cpu_tmp);
+
+ locality = rq->cpu_locality[cpu_tmp];
+#ifdef CONFIG_NUMA
+ if (locality > LOCALITY_SMP)
+ ranking |= CPUIDLE_DIFF_NODE;
+ else
+#endif
+ if (locality > LOCALITY_MC)
+ ranking |= CPUIDLE_DIFF_CPU;
+#ifdef CONFIG_SCHED_MC
+ else if (locality == LOCALITY_MC_LLC)
+ ranking |= CPUIDLE_DIFF_CORE_LLC;
+ else if (locality == LOCALITY_MC)
+ ranking |= CPUIDLE_DIFF_CORE;
+ if (!(tmp_rq->cache_idle(tmp_rq)))
+ ranking |= CPUIDLE_CACHE_BUSY;
+#endif
+#ifdef CONFIG_SCHED_SMT
+ if (locality == LOCALITY_SMT)
+ ranking |= CPUIDLE_DIFF_THREAD;
+#endif
+ if (ranking < best_ranking
+#ifdef CONFIG_SCHED_SMT
+ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq)))
+#endif
+ ) {
+ best_cpu = cpu_tmp;
+ best_ranking = ranking;
+ }
+ }
+out:
+ return best_cpu;
+}
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+ struct rq *this_rq = cpu_rq(this_cpu);
+
+ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP);
+}
+
+/* As per resched_curr but only will resched idle task */
+static inline void resched_idle(struct rq *rq)
+{
+ if (test_tsk_need_resched(rq->idle))
+ return;
+
+ rq->preempt = rq->idle;
+
+ set_tsk_need_resched(rq->idle);
+
+ if (rq_local(rq)) {
+ set_preempt_need_resched();
+ return;
+ }
+
+ smp_sched_reschedule(rq->cpu);
+}
+
+DEFINE_PER_CPU(cpumask_t, idlemask);
+
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
+{
+ cpumask_t *tmpmask = &(per_cpu(idlemask, cpu));
+ struct rq *rq;
+ int best_cpu;
+
+ cpumask_and(tmpmask, p->cpus_ptr, &cpu_idle_map);
+ best_cpu = best_mask_cpu(cpu, task_rq(p), tmpmask);
+ rq = cpu_rq(best_cpu);
+ if (!smt_schedule(p, rq))
+ return NULL;
+ rq->preempt = p;
+ resched_idle(rq);
+ return rq;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+ if (suitable_idle_cpus(p))
+ resched_best_idle(p, task_cpu(p));
+}
+
+static inline struct rq *rq_order(struct rq *rq, int cpu)
+{
+ return rq->rq_order[cpu];
+}
+#else /* CONFIG_SMP */
+static inline void set_cpuidle_map(int cpu)
+{
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+}
+
+static inline bool suitable_idle_cpus(struct task_struct *p)
+{
+ return uprq->curr == uprq->idle;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+}
+
+static inline void resched_curr(struct rq *rq)
+{
+ resched_task(rq->curr);
+}
+
+static inline void resched_if_idle(struct rq *rq)
+{
+}
+
+static inline bool rq_local(struct rq *rq)
+{
+ return true;
+}
+
+static inline struct rq *rq_order(struct rq *rq, int cpu)
+{
+ return rq;
+}
+
+static inline bool smt_schedule(struct task_struct *p, struct rq *rq)
+{
+ return true;
+}
+#endif /* CONFIG_SMP */
+
+static inline int normal_prio(struct task_struct *p)
+{
+ if (has_rt_policy(p))
+ return MAX_RT_PRIO - 1 - p->rt_priority;
+ if (idleprio_task(p))
+ return IDLE_PRIO;
+ if (iso_task(p))
+ return ISO_PRIO;
+ return NORMAL_PRIO;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks as it will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+/*
+ * activate_task - move a task to the runqueue. Enter with rq locked.
+ */
+static void activate_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ resched_if_idle(rq);
+
+ /*
+ * Sleep time is in units of nanosecs, so shift by 20 to get a
+ * milliseconds-range estimation of the amount of time that the task
+ * spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+ (rq->niffies - p->last_ran) >> 20);
+ }
+
+ p->prio = effective_prio(p);
+ enqueue_task(rq, p, flags);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+}
+
+/*
+ * deactivate_task - If it's running, it's not on the runqueue and we can just
+ * decrement the nr_running. Enter with rq locked.
+ */
+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
+{
+ p->on_rq = 0;
+ sched_info_dequeued(rq, p);
+ /* deactivate_task is always DEQUEUE_SLEEP in muqss */
+ psi_dequeue(p, DEQUEUE_SLEEP);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+{
+ struct rq *rq;
+
+ if (task_cpu(p) == new_cpu)
+ return;
+
+ /* Do NOT call set_task_cpu on a currently queued task as we will not
+ * be reliably holding the rq lock after changing CPU. */
+ BUG_ON(task_queued(p));
+ rq = task_rq(p);
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * The caller should hold either p->pi_lock or rq->lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_rq_lock().
+ */
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(rq->lock)));
+#endif
+
+ trace_sched_migrate_task(p, new_cpu);
+ rseq_migrate(p);
+ perf_event_task_migrate(p);
+
+ /*
+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+ * successfully executed on another CPU. We must ensure that updates of
+ * per-task data have been completed by this moment.
+ */
+ smp_wmb();
+
+ p->wake_cpu = new_cpu;
+
+ if (task_running(rq, p)) {
+ /*
+ * We should only be calling this on a running task if we're
+ * holding rq lock.
+ */
+ lockdep_assert_held(rq->lock);
+
+ /*
+ * We can't change the task_thread_info CPU on a running task
+ * as p will still be protected by the rq lock of the CPU it
+ * is still running on so we only set the wake_cpu for it to be
+ * lazily updated once off the CPU.
+ */
+ return;
+ }
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ WRITE_ONCE(p->cpu, new_cpu);
+#else
+ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu);
+#endif
+ /* We're no longer protecting p after this point since we're holding
+ * the wrong runqueue lock. */
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Move a task off the runqueue and take it to a cpu as it will
+ * become the running task.
+ */
+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
+{
+ struct rq *p_rq = task_rq(p);
+
+ dequeue_task(p_rq, p, DEQUEUE_SAVE);
+ if (p_rq != rq) {
+ sched_info_dequeued(p_rq, p);
+ sched_info_queued(rq, p);
+ }
+ set_task_cpu(p, cpu);
+}
+
+/*
+ * Returns a descheduling task to the runqueue unless it is being
+ * deactivated.
+ */
+static inline void return_task(struct task_struct *p, struct rq *rq,
+ int cpu, bool deactivate)
+{
+ if (deactivate)
+ deactivate_task(p, rq);
+ else {
+#ifdef CONFIG_SMP
+ /*
+ * set_task_cpu was called on the running task that doesn't
+ * want to deactivate so it has to be enqueued to a different
+ * CPU and we need its lock. Tag it to be moved when the
+ * lock is dropped in finish_lock_switch.
+ */
+ if (unlikely(p->wake_cpu != cpu))
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
+ else
+#endif
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
+ }
+}
+
+/* Enter with rq lock held. We know p is on the local cpu */
+static inline void __set_tsk_resched(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ set_preempt_need_resched();
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+ return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change. If it changes, i.e. @p might have woken up,
+ * then return zero. When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count). If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+ int running, queued;
+ struct rq_flags rf;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since this will return false
+ * if the runqueue has changed and p is actually now
+ * running somewhere else!
+ */
+ while (task_running(rq, p)) {
+ if (match_state && unlikely(p->state != match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &rf);
+ trace_sched_wait_task(p);
+ running = task_running(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if (!match_state || p->state == match_state)
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ task_rq_unlock(rq, p, &rf);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = NSEC_PER_SEC / HZ;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if ((cpu != smp_processor_id()) && task_curr(p))
+ smp_sched_reschedule(cpu);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif
+
+/*
+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
+ * between themselves, they cooperatively multitask. An idle rq scores as
+ * prio PRIO_LIMIT so it is always preempted.
+ */
+static inline bool
+can_preempt(struct task_struct *p, int prio, u64 deadline)
+{
+ /* Better static priority RT task or better policy preemption */
+ if (p->prio < prio)
+ return true;
+ if (p->prio > prio)
+ return false;
+ if (p->policy == SCHED_BATCH)
+ return false;
+ /* SCHED_NORMAL and ISO will preempt based on deadline */
+ if (!deadline_before(p->deadline, deadline))
+ return false;
+ return true;
+}
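+
+/*
+ * Illustrative example: prio values compare lower-is-better, so an RT task
+ * at prio 10 preempts an rq running at prio 20 irrespective of deadlines,
+ * while two SCHED_NORMAL tasks of equal prio fall through to
+ * deadline_before(), where the earlier virtual deadline wins. An idle rq
+ * advertises PRIO_LIMIT and therefore loses every comparison.
+ */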
+
+#ifdef CONFIG_SMP
+
+/*
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
+ * __set_cpus_allowed_ptr().
+ */
+static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+{
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
+ if (!(p->flags & PF_KTHREAD))
+ return cpu_active(cpu);
+
+ /* KTHREAD_IS_PER_CPU is always allowed. */
+ if (kthread_is_per_cpu(p))
+ return cpu_online(cpu);
+
+	/* Other kernel threads are allowed while the CPU is online. */
+ return cpu_online(cpu);
+}
+
+/*
+ * Check to see if p can run on cpu, and if not, whether there are any online
+ * CPUs it can run on instead. This only happens with the hotplug threads that
+ * bring up the CPUs.
+ */
+static inline bool sched_other_cpu(struct task_struct *p, int cpu)
+{
+ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr)))
+ return false;
+ if (p->nr_cpus_allowed == 1) {
+ cpumask_t valid_mask;
+
+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask);
+ if (unlikely(cpumask_empty(&valid_mask)))
+ return false;
+ }
+ return true;
+}
+
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+ if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+ return true;
+}
+
+#define cpu_online_map (*(cpumask_t *)cpu_online_mask)
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+ int i, this_entries = rq_load(this_rq);
+ cpumask_t tmp;
+
+ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p)))
+ return;
+
+ /* IDLEPRIO tasks never preempt anything but idle */
+ if (p->policy == SCHED_IDLEPRIO)
+ return;
+
+ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr);
+
+ for (i = 0; i < num_online_cpus(); i++) {
+ struct rq *rq = this_rq->cpu_order[i];
+
+ if (!cpumask_test_cpu(rq->cpu, &tmp))
+ continue;
+
+ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries)
+ continue;
+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) {
+ /* We set rq->preempting lockless, it's a hint only */
+ rq->preempting = p;
+ resched_curr(rq);
+ return;
+ }
+ }
+}
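+
+/*
+ * Example (illustrative): with sched_interactive disabled, the scan above
+ * skips remote rqs that are no more loaded than the local one, trading a
+ * wider preemption search for cache locality; with it enabled, every
+ * online rq in p's affinity mask is a preemption candidate, scanned in
+ * this_rq->cpu_order.
+ */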
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask,
+ u32 flags);
+#else /* CONFIG_SMP */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+ return false;
+}
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+ if (p->policy == SCHED_IDLEPRIO)
+ return;
+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
+ resched_curr(uprq);
+}
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask,
+ u32 __always_unused flags)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
+#endif /* CONFIG_SMP */
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq;
+
+ if (!schedstat_enabled())
+ return;
+
+ rq = this_rq();
+
+#ifdef CONFIG_SMP
+ if (cpu == rq->cpu) {
+ __schedstat_inc(rq->ttwu_local);
+ } else {
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ for_each_domain(rq->cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ __schedstat_inc(sd->ttwu_wake_remote);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+#endif /* CONFIG_SMP */
+
+ __schedstat_inc(rq->ttwu_count);
+}
+
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ /*
+ * Sync wakeups (i.e. those types of wakeups where the waker
+ * has indicated that it will leave the CPU in short order)
+ * don't trigger a preemption if there are no idle cpus,
+ * instead waiting for current to deschedule.
+ */
+ if (wake_flags & WF_SYNC)
+ resched_suitable_idle(p);
+ else
+ try_preempt(p, rq);
+ p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ int en_flags = ENQUEUE_WAKEUP;
+
+ lockdep_assert_held(rq->lock);
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+#ifdef CONFIG_SMP
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+ else
+#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ activate_task(rq, p, en_flags);
+ ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Consider @p being inside a wait loop:
+ *
+ * for (;;) {
+ * set_current_state(TASK_UNINTERRUPTIBLE);
+ *
+ * if (CONDITION)
+ * break;
+ *
+ * schedule();
+ * }
+ * __set_current_state(TASK_RUNNING);
+ *
+ * between set_current_state() and schedule(). In this case @p is still
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
+ * an atomic manner.
+ *
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
+ * then schedule() must still happen and p->state can be changed to
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
+ * need to do a full wakeup with enqueue.
+ *
+ * Returns: %true when the wakeup is done,
+ * %false otherwise.
+ */
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p, NULL);
+ if (likely(task_on_rq_queued(p))) {
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq, NULL);
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+void sched_ttwu_pending(void *arg)
+{
+ struct llist_node *llist = arg;
+ struct rq *rq = this_rq();
+ struct task_struct *p, *t;
+ struct rq_flags rf;
+
+ if (!llist)
+ return;
+
+ /*
+	 * rq::ttwu_pending is a racy indication of outstanding wakeups.
+	 * Races such that false-negatives are possible, since they
+	 * are shorter lived than false-positives would be.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
+
+ rq_lock_irqsave(rq, &rf);
+
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
+ if (WARN_ON_ONCE(p->on_cpu))
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
+ set_task_cpu(p, cpu_of(rq));
+
+ ttwu_do_activate(rq, p, 0);
+ }
+
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+void send_call_function_single_ipi(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!set_nr_if_polling(rq->idle))
+ arch_send_call_function_single_ipi(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+/*
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ WRITE_ONCE(rq->ttwu_pending, 1);
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
+}
+
+void wake_up_if_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
+
+ if (set_nr_if_polling(rq->idle)) {
+ trace_sched_wake_idle_without_ipi(cpu);
+ } else {
+ rq_lock_irqsave(rq, &rf);
+ if (likely(is_idle_task(rq->curr)))
+ smp_sched_reschedule(cpu);
+ /* Else cpu is not in idle, do nothing here */
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+out:
+ rcu_read_unlock();
+}
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+ /*
+ * Do not complicate things with the async wake_list while the CPU is
+ * in hotplug state.
+ */
+ if (!cpu_active(cpu))
+ return false;
+
+ /*
+ * If the CPU does not share cache, then queue the task on the
+ * remote rqs wakelist to avoid accessing remote data.
+ */
+ if (!cpus_share_cache(smp_processor_id(), cpu))
+ return true;
+
+ /*
+ * If the task is descheduling and the only running task on the
+ * CPU then use the wakelist to offload the task activation to
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
+ * nr_running is checked to avoid unnecessary task stacking.
+ */
+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ return true;
+
+ return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+	/*
+	 * CFS would also check sched_feat(TTWU_QUEUE) here, but in MuQSS
+	 * that behaviour is always enabled.
+	 */
+ if (ttwu_queue_cond(cpu, wake_flags)) {
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
+ return false;
+
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
+ return true;
+ }
+
+ return false;
+}
+
+static int valid_task_cpu(struct task_struct *p)
+{
+ cpumask_t valid_mask;
+
+ if (p->flags & PF_KTHREAD)
+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask);
+ else
+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask);
+
+ if (unlikely(!cpumask_weight(&valid_mask))) {
+ /* We shouldn't be hitting this any more */
+ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm,
+ p->pid, cpumask_weight(p->cpus_ptr));
+ return cpumask_any(p->cpus_ptr);
+ }
+ return cpumask_any(&valid_mask);
+}
+
+/*
+ * For a task that's just being woken up we have a valuable balancing
+ * opportunity so choose the nearest cache most lightly loaded runqueue.
+ * Entered with rq locked and returns with the chosen runqueue locked.
+ */
+static inline int select_best_cpu(struct task_struct *p)
+{
+ unsigned int idlest = ~0U;
+ struct rq *rq = NULL;
+ int i;
+
+ if (suitable_idle_cpus(p)) {
+ int cpu = task_cpu(p);
+
+ if (unlikely(needs_other_cpu(p, cpu)))
+ cpu = valid_task_cpu(p);
+ rq = resched_best_idle(p, cpu);
+ if (likely(rq))
+ return rq->cpu;
+ }
+
+ for (i = 0; i < num_online_cpus(); i++) {
+ struct rq *other_rq = task_rq(p)->cpu_order[i];
+ int entries;
+
+ if (!other_rq->online)
+ continue;
+ if (needs_other_cpu(p, other_rq->cpu))
+ continue;
+ entries = rq_load(other_rq);
+ if (entries >= idlest)
+ continue;
+ idlest = entries;
+ rq = other_rq;
+ }
+ if (unlikely(!rq))
+ return task_cpu(p);
+ return rq->cpu;
+}
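+
+/*
+ * Example (illustrative): a wakee whose CPUs are all busy skips the
+ * resched_best_idle() fast path and falls through to the scan above,
+ * which walks task_rq(p)->cpu_order (per the comment above, nearest cache
+ * first) and picks the online, affinity-allowed rq with the smallest
+ * rq_load(); if nothing qualifies the task stays on task_cpu(p).
+ */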
+#else /* CONFIG_SMP */
+
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ return false;
+}
+
+static int valid_task_cpu(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline int select_best_cpu(struct task_struct *p)
+{
+ return 0;
+}
+
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
+{
+ return NULL;
+}
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
+ return;
+
+ rq_lock(rq);
+ update_rq_clock(rq);
+ ttwu_do_activate(rq, p, wake_flags);
+ rq_unlock(rq);
+}
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Return: %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
+ */
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+{
+ unsigned long flags;
+ int cpu, success = 0;
+
+ preempt_disable();
+ if (p == current) {
+ /*
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
+ * == smp_processor_id()'. Together this means we can special
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
+ * without taking any locks.
+ *
+ * In particular:
+ * - we rely on Program-Order guarantees for all the ordering,
+ * - we're serialized against set_special_state() by virtue of
+ * it disabling IRQs (this allows not taking ->pi_lock).
+ */
+ if (!(p->state & state))
+ goto out;
+
+ success = 1;
+ trace_sched_waking(p);
+ p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+ goto out;
+ }
+
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with smp_store_mb()
+ * in set_current_state() that the waiting thread does.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ smp_mb__after_spinlock();
+ if (!(p->state & state))
+ goto unlock;
+
+ trace_sched_waking(p);
+
+ /* We're going to change ->state: */
+ success = 1;
+
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ */
+ smp_rmb();
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
+ goto unlock;
+
+#ifdef CONFIG_SMP
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
+	 * care about its own p->state. See the comment in __schedule().
+ */
+ smp_acquire__after_ctrl_dep();
+
+ /*
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+ * == 0), which means we need to do an enqueue, change p->state to
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
+ * enqueue, such as ttwu_queue_wakelist().
+ */
+ p->state = TASK_WAKING;
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+ goto unlock;
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_task().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ cpu = select_best_cpu(p);
+ if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ }
+
+#else
+ cpu = task_cpu(p);
+#endif /* CONFIG_SMP */
+
+ ttwu_queue(p, cpu, wake_flags);
+unlock:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+out:
+ if (success)
+ ttwu_stat(p, task_cpu(p), wake_flags);
+ preempt_enable();
+
+ return success;
+}
+
+/**
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
+ * @p: Process for which the function is to be invoked, can be @current.
+ * @func: Function to invoke.
+ * @arg: Argument to function.
+ *
+ * If the specified task can be quickly locked into a definite state
+ * (either sleeping or on a given runqueue), arrange to keep it in that
+ * state while invoking @func(@arg). This function can use ->on_rq and
+ * task_curr() to work out what the state is, if required. Given that
+ * @func can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
+ *
+ * Returns:
+ * @false if the task slipped out from under the locks.
+ * @true if the task was locked onto a runqueue or is sleeping.
+ * However, @func can override this by returning @false.
+ */
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
+{
+ struct rq_flags rf;
+ bool ret = false;
+ struct rq *rq;
+
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ if (p->on_rq) {
+ rq = __task_rq_lock(p, NULL);
+ if (task_rq(p) == rq)
+ ret = func(p, arg);
+ rq_unlock(rq);
+ } else {
+ switch (p->state) {
+ case TASK_RUNNING:
+ case TASK_WAKING:
+ break;
+ default:
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
+ if (!p->on_rq)
+ ret = func(p, arg);
+ }
+ }
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+ return ret;
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * This function executes a full memory barrier before accessing the task state.
+ */
+int wake_up_process(struct task_struct *p)
+{
+ return try_to_wake_up(p, TASK_NORMAL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+ return try_to_wake_up(p, state, 0);
+}
+
+static void time_slice_expired(struct task_struct *p, struct rq *rq);
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
+{
+ unsigned long flags;
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
+
+#ifdef CONFIG_SMP
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
+#endif
+ /*
+ * We mark the process as NEW here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_NEW;
+
+ /* Should be reset in fork.c but done here for ease of MuQSS patching */
+ p->on_cpu =
+ p->on_rq =
+ p->utime =
+ p->stime =
+ p->sched_time =
+ p->stime_ns =
+ p->utime_ns = 0;
+ skiplist_node_init(&p->node);
+
+ /*
+ * Revert to default priority/policy on fork if requested.
+ */
+ if (unlikely(p->sched_reset_on_fork)) {
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p->policy == SCHED_ISO) {
+ p->policy = SCHED_NORMAL;
+ p->normal_prio = normal_prio(p);
+ }
+
+ if (PRIO_TO_NICE(p->static_prio) < 0) {
+ p->static_prio = NICE_TO_PRIO(0);
+ p->normal_prio = p->static_prio;
+ }
+
+ /*
+ * We don't need the reset flag anymore after the fork. It has
+ * fulfilled its duty:
+ */
+ p->sched_reset_on_fork = 0;
+ }
+
+ /*
+ * Silence PROVE_RCU.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ rseq_migrate(p);
+ set_task_cpu(p, smp_processor_id());
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+#ifdef CONFIG_SCHED_INFO
+ if (unlikely(sched_info_on()))
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+ init_task_preempt_count(p);
+
+ return 0;
+}
+
+void sched_post_fork(struct task_struct *p)
+{
+}
+
+#ifdef CONFIG_SCHEDSTATS
+
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
+
+static void set_schedstats(bool enabled)
+{
+ if (enabled)
+ static_branch_enable(&sched_schedstats);
+ else
+ static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+ if (!schedstat_enabled()) {
+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+ static_branch_enable(&sched_schedstats);
+ }
+}
+
+static int __init setup_schedstats(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ /*
+ * This code is called before jump labels have been set up, so we can't
+ * change the static branch directly just yet. Instead set a temporary
+ * variable so init_schedstats() can do it later.
+ */
+ if (!strcmp(str, "enable")) {
+ __sched_schedstats = true;
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ __sched_schedstats = false;
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse schedstats=\n");
+
+ return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+static void __init init_schedstats(void)
+{
+ set_schedstats(__sched_schedstats);
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_schedstats);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_schedstats(state);
+ return err;
+}
+#endif /* CONFIG_PROC_SYSCTL */
+#else /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
+
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p);
+
+static void account_task_cpu(struct rq *rq, struct task_struct *p)
+{
+ update_clocks(rq);
+ /* This isn't really a context switch but accounting is the same */
+ update_cpu_clock_switch(rq, p);
+ p->last_ran = rq->niffies;
+}
+
+bool sched_smp_initialized __read_mostly;
+
+static inline int hrexpiry_enabled(struct rq *rq)
+{
+ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized))
+ return 0;
+ return hrtimer_is_hres_active(&rq->hrexpiry_timer);
+}
+
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ */
+static inline void hrexpiry_clear(struct rq *rq)
+{
+ if (!hrexpiry_enabled(rq))
+ return;
+ if (hrtimer_active(&rq->hrexpiry_timer))
+ hrtimer_cancel(&rq->hrexpiry_timer);
+}
+
+/*
+ * High-resolution time_slice expiry.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrexpiry(struct hrtimer *timer)
+{
+ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer);
+ struct task_struct *p;
+
+ /* This can happen during CPU hotplug / resume */
+ if (unlikely(cpu_of(rq) != smp_processor_id()))
+ goto out;
+
+ /*
+ * We're doing this without the runqueue lock but this should always
+ * be run on the local CPU. Time slice should run out in __schedule
+ * but we set it to zero here in case niffies is slightly less.
+ */
+ p = rq->curr;
+ p->time_slice = 0;
+ __set_tsk_resched(p);
+out:
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Called to set the hrexpiry timer state.
+ *
+ * called with irqs disabled from the local CPU only
+ */
+static void hrexpiry_start(struct rq *rq, u64 delay)
+{
+ if (!hrexpiry_enabled(rq))
+ return;
+
+ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void init_rq_hrexpiry(struct rq *rq)
+{
+ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rq->hrexpiry_timer.function = hrexpiry;
+}
+
+static inline int rq_dither(struct rq *rq)
+{
+ if (!hrexpiry_enabled(rq))
+ return HALF_JIFFY_US;
+ return 0;
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+ struct task_struct *parent, *rq_curr;
+ struct rq *rq, *new_rq;
+ unsigned long flags;
+
+ parent = p->parent;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ p->state = TASK_RUNNING;
+ /* Task_rq can't change yet on a new task */
+ new_rq = rq = task_rq(p);
+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) {
+ set_task_cpu(p, valid_task_cpu(p));
+ new_rq = task_rq(p);
+ }
+
+ double_rq_lock(rq, new_rq);
+ rq_curr = rq->curr;
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child.
+ */
+ p->prio = rq_curr->normal_prio;
+
+ trace_sched_wakeup_new(p);
+
+ /*
+ * Share the timeslice between parent and child, thus the
+ * total amount of pending timeslices in the system doesn't change,
+ * resulting in more scheduling fairness. If it's negative, it won't
+ * matter since that's the same as being 0. rq->rq_deadline is only
+ * modified within schedule() so it is always equal to
+ * current->deadline.
+ */
+ account_task_cpu(rq, rq_curr);
+ p->last_ran = rq_curr->last_ran;
+ if (likely(rq_curr->policy != SCHED_FIFO)) {
+ rq_curr->time_slice /= 2;
+ if (rq_curr->time_slice < RESCHED_US) {
+ /*
+ * Forking task has run out of timeslice. Reschedule it and
+ * start its child with a new time slice and deadline. The
+ * child will end up running first because its deadline will
+ * be slightly earlier.
+ */
+ __set_tsk_resched(rq_curr);
+ time_slice_expired(p, new_rq);
+ if (suitable_idle_cpus(p))
+ resched_best_idle(p, task_cpu(p));
+ else if (unlikely(rq != new_rq))
+ try_preempt(p, new_rq);
+ } else {
+ p->time_slice = rq_curr->time_slice;
+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) {
+ /*
+ * The VM isn't cloned, so we're in a good position to
+ * do child-runs-first in anticipation of an exec. This
+ * usually avoids a lot of COW overhead.
+ */
+ __set_tsk_resched(rq_curr);
+ } else {
+ /*
+ * Adjust the hrexpiry since rq_curr will keep
+ * running and its timeslice has been shortened.
+ */
+ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice));
+ try_preempt(p, new_rq);
+ }
+ }
+ } else {
+ time_slice_expired(p, new_rq);
+ try_preempt(p, new_rq);
+ }
+ activate_task(new_rq, p, 0);
+ double_rq_unlock(rq, new_rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
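+
+/*
+ * Example (illustrative): a non-FIFO parent with 6ms of timeslice left
+ * forks; parent and child end up with 3ms each, so the total pending
+ * timeslice is unchanged. Had the remaining half been under RESCHED_US,
+ * the parent would be rescheduled and the child given a fresh slice and
+ * slightly earlier deadline via time_slice_expired(), making it run first.
+ */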
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
+
+void preempt_notifier_inc(void)
+{
+ static_branch_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+ static_branch_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+ if (!static_branch_unlikely(&preempt_notifier_key))
+ WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
+ hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is *not* safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+ hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ if (static_branch_unlikely(&preempt_notifier_key))
+ __fire_sched_in_preempt_notifiers(curr);
+}
+
+static void
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_out(notifier, next);
+}
+
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ if (static_branch_unlikely(&preempt_notifier_key))
+ __fire_sched_out_preempt_notifiers(curr, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+static inline void prepare_task(struct task_struct *next)
+{
+ /*
+ * Claim the task as running, we do this before switching to it
+ * such that any running task will have this set.
+ *
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
+ */
+ WRITE_ONCE(next->on_cpu, 1);
+}
+
+static inline void finish_task(struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * This must be the very last reference to @prev from this CPU. After
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
+ * must ensure this doesn't happen until the switch is completely
+ * finished.
+ *
+ * In particular, the load of prev->state in finish_task_switch() must
+ * happen before this.
+ *
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+ */
+ smp_store_release(&prev->on_cpu, 0);
+#endif
+}
+
+static inline void
+prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+ /*
+	 * The runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+ spin_release(&rq->lock->dep_map, _THIS_IP_);
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock->owner = next;
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_);
+
+#ifdef CONFIG_SMP
+ /*
+ * If prev was marked as migrating to another CPU in return_task, drop
+ * the local runqueue lock but leave interrupts disabled and grab the
+ * remote lock we're migrating it to before enabling them.
+ */
+ if (unlikely(task_on_rq_migrating(prev))) {
+ sched_info_dequeued(rq, prev);
+ /*
+ * We move the ownership of prev to the new cpu now. ttwu can't
+ * activate prev to the wrong cpu since it has to grab this
+		 * runqueue in ttwu_runnable().
+ */
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ prev->cpu = prev->wake_cpu;
+#else
+ task_thread_info(prev)->cpu = prev->wake_cpu;
+#endif
+ raw_spin_unlock(rq->lock);
+
+ raw_spin_lock(&prev->pi_lock);
+ rq = __task_rq_lock(prev, NULL);
+ /* Check that someone else hasn't already queued prev */
+ if (likely(!task_queued(prev))) {
+ enqueue_task(rq, prev, 0);
+ prev->on_rq = TASK_ON_RQ_QUEUED;
+ /* Wake up the CPU if it's not already running */
+ resched_if_idle(rq);
+ }
+ raw_spin_unlock(&prev->pi_lock);
+ }
+#endif
+ raw_spin_unlock_irq(rq->lock);
+}
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev) do { } while (0)
+#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
+
+static inline void kmap_local_sched_out(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_out();
+#endif
+}
+
+static inline void kmap_local_sched_in(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_in();
+#endif
+}
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ kcov_prepare_switch(prev);
+ sched_info_switch(rq, prev, next);
+ perf_event_task_sched_out(prev, next);
+ rseq_preempt(prev);
+ fire_sched_out_preempt_notifiers(prev, next);
+ kmap_local_sched_out();
+ prepare_task(next);
+ prepare_arch_switch(next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ *
+ * The context switch has flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
+ */
+static void finish_task_switch(struct task_struct *prev)
+ __releases(rq->lock)
+{
+ struct rq *rq = this_rq();
+ struct mm_struct *mm = rq->prev_mm;
+ long prev_state;
+
+ /*
+ * The previous task will have left us with a preempt_count of 2
+ * because it left us after:
+ *
+ * schedule()
+ * preempt_disable(); // 1
+ * __schedule()
+ * raw_spin_lock_irq(rq->lock) // 2
+ *
+ * Also, see FORK_PREEMPT_COUNT.
+ */
+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+ "corrupted preempt_count: %s/%d/0x%x\n",
+ current->comm, current->pid, preempt_count()))
+ preempt_count_set(FORK_PREEMPT_COUNT);
+
+ rq->prev_mm = NULL;
+
+ /*
+ * A task struct has one reference for the use as "current".
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+ *
+ * We must observe prev->state before clearing prev->on_cpu (in
+ * finish_task), otherwise a concurrent wakeup can get prev
+	 * running on another CPU and we could race with its RUNNING -> DEAD
+ * transition, resulting in a double drop.
+ */
+ prev_state = prev->state;
+ vtime_task_switch(prev);
+ perf_event_task_sched_in(prev, current);
+ finish_task(prev);
+ finish_lock_switch(rq, prev);
+ finish_arch_post_lock_switch();
+ kcov_finish_switch(current);
+ /*
+ * kmap_local_sched_out() is invoked with rq::lock held and
+ * interrupts disabled. There is no requirement for that, but the
+ * sched out code does not have an interrupt enabled section.
+ * Restoring the maps on sched in does not require interrupts being
+ * disabled either.
+ */
+ kmap_local_sched_in();
+
+ fire_sched_in_preempt_notifiers(current);
+ /*
+ * When switching through a kernel thread, the loop in
+ * membarrier_{private,global}_expedited() may have observed that
+ * kernel thread and not issued an IPI. It is therefore possible to
+	 * schedule between user->kernel->user threads without passing through
+ * switch_mm(). Membarrier requires a barrier after storing to
+ * rq->curr, before returning to userspace, so provide them here:
+ *
+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+ * provided by mmdrop(),
+ * - a sync_core for SYNC_CORE.
+ */
+ if (mm) {
+ membarrier_mm_sync_core_before_usermode(mm);
+ mmdrop(mm);
+ }
+ if (unlikely(prev_state == TASK_DEAD)) {
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
+
+ /* Task is done with its stack. */
+ put_task_stack(prev);
+
+ put_task_struct_rcu_user(prev);
+ }
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
+{
+ /*
+ * New tasks start with FORK_PREEMPT_COUNT, see there and
+ * finish_task_switch() for details.
+ *
+	 * finish_task_switch() will drop rq->lock and lower preempt_count
+ * and the preempt_enable() will end up enabling preemption (on
+ * PREEMPT_COUNT kernels).
+ */
+
+ finish_task_switch(prev);
+ preempt_enable();
+
+ if (current->set_child_tid)
+ put_user(task_pid_vnr(current), current->set_child_tid);
+
+ calculate_sigpending();
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline void
+context_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ prepare_task_switch(rq, prev, next);
+
+ /*
+ * For paravirt, this is coupled with an exit in switch_to to
+ * combine the page table reload and the switch backend into
+ * one hypercall.
+ */
+ arch_start_context_switch(prev);
+
+ /*
+ * kernel -> kernel lazy + transfer active
+ * user -> kernel lazy + mmgrab() active
+ *
+ * kernel -> user switch + mmdrop() active
+ * user -> user switch
+ */
+ if (!next->mm) { // to kernel
+ enter_lazy_tlb(prev->active_mm, next);
+
+ next->active_mm = prev->active_mm;
+ if (prev->mm) // from user
+ mmgrab(prev->active_mm);
+ else
+ prev->active_mm = NULL;
+ } else { // to user
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
+ /*
+ * sys_membarrier() requires an smp_mb() between setting
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
+ *
+ * The below provides this either through switch_mm(), or in
+ * case 'prev->active_mm == next->mm' through
+ * finish_task_switch()'s mmdrop().
+ */
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+ if (!prev->mm) { // from kernel
+ /* will mmdrop() in finish_task_switch(). */
+ rq->prev_mm = prev->active_mm;
+ prev->active_mm = NULL;
+ }
+ }
+ prepare_lock_switch(rq, next);
+
+ /* Here we just switch the register state and the stack. */
+ switch_to(prev, next, prev);
+ barrier();
+
+ finish_task_switch(prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, total number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->nr_running;
+
+ return sum;
+}
+
+static unsigned long nr_uninterruptible(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->nr_uninterruptible;
+
+ return sum;
+}
+
+/*
+ * Check if only the current task is running on the CPU.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race. The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptible section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
+ */
+bool single_task_running(void)
+{
+ if (rq_load(raw_rq()) == 1)
+ return true;
+ else
+ return false;
+}
+EXPORT_SYMBOL(single_task_running);
+
+unsigned long long nr_context_switches(void)
+{
+ int cpu;
+ unsigned long long sum = 0;
+
+ for_each_possible_cpu(cpu)
+ sum += cpu_rq(cpu)->nr_switches;
+
+ return sum;
+}
+
+/*
+ * Consumers of these two interfaces, like for example the cpufreq menu
+ * governor, are using nonsensical data: they boost the frequency of a CPU
+ * that has IO-wait pending, even though that CPU might not end up running
+ * the task once it becomes runnable.
+ */
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
+}
+
+/*
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
+ *
+ * The idea behind IO-wait accounting is to account the idle time that we
+ * could have spent running if it were not for IO. That is, if we were to
+ * improve the storage performance, we'd have a proportional reduction in
+ * IO-wait time.
+ *
+ * This all works nicely on UP, where, when a task blocks on IO, we account
+ * idle time as IO-wait, because if the storage were faster, it could've been
+ * running and we'd not be idle.
+ *
+ * This has been extended to SMP, by doing the same for each CPU. This however
+ * is broken.
+ *
+ * Imagine for instance the case where two tasks block on one CPU, only the one
+ * CPU will have IO-wait accounted, while the other has regular idle. Even
+ * though, if the storage were faster, both could've run at the same time,
+ * utilising both CPUs.
+ *
+ * This means, that when looking globally, the current IO-wait accounting on
+ * SMP is a lower bound, by reason of under accounting.
+ *
+ * Worse, since the numbers are provided per CPU, they are sometimes
+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
+ * associated with any one particular CPU, it can wake to another CPU than it
+ * blocked on. This means the per CPU IO-wait number is meaningless.
+ *
+ * Task CPU affinities can make all that even more 'interesting'.
+ */
+
+unsigned long nr_iowait(void)
+{
+ unsigned long cpu, sum = 0;
+
+ for_each_possible_cpu(cpu)
+ sum += nr_iowait_cpu(cpu);
+
+ return sum;
+}
+
+unsigned long nr_active(void)
+{
+ return nr_running() + nr_uninterruptible();
+}
+
+/* Variables and functions for calc_load */
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+/*
+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds.
+ */
+void calc_global_load(void)
+{
+ long active;
+
+ if (time_before(jiffies, READ_ONCE(calc_load_update)))
+ return;
+ active = nr_active() * FIXED_1;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update = jiffies + LOAD_FREQ;
+}
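+
+/*
+ * Worked example (illustrative): avenrun[] is fixed-point with
+ * FIXED_1 == 1 << FSHIFT. With two runnable tasks (active == 2*FIXED_1)
+ * and avenrun[0] starting at 0, one LOAD_FREQ update yields
+ * 2*FIXED_1*(FIXED_1 - EXP_1)/FIXED_1 == 328, i.e. ~0.16 for the
+ * 1-minute figure, which then converges geometrically towards 2.0.
+ */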
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+ }
+
+ return result;
+}
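+
+/*
+ * Worked example (illustrative): n == 5 == 0b101. The loop consumes bits
+ * right to left: bit 0 set -> result *= x; square x (now x^2); bit 1
+ * clear -> skip; square x (now x^4); bit 2 set -> result *= x^4, giving
+ * x^5 overall in O(log n) steps. The "result += 1UL << (frac_bits - 1)"
+ * additions round to nearest in the frac_bits fixed-point format.
+ */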
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
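+
+/*
+ * Example (illustrative): if updates were skipped for n == 3 LOAD_FREQ
+ * periods (e.g. a long idle stretch), rather than iterating calc_load()
+ * three times we apply the closed form above in one step:
+ * load' = load * e^3 + active * (1 - e^3), with e^3 supplied by
+ * fixed_power_int(exp, FSHIFT, 3).
+ */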
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+ if (unlikely(steal > NSEC_PER_SEC))
+ return div_u64(steal, TICK_NSEC);
+
+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
+#endif
+
+/*
+ * On each tick, add the number of nanoseconds to the unbanked variables and
+ * once one tick's worth has accumulated, account it allowing for accurate
+ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we
+ * deduct nanoseconds.
+ */
+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long ticks;
+
+ if (atomic_read(&rq->nr_iowait) > 0) {
+ rq->iowait_ns += ns;
+ if (rq->iowait_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->iowait_ns);
+ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks;
+ rq->iowait_ns %= JIFFY_NS;
+ }
+ } else {
+ rq->idle_ns += ns;
+ if (rq->idle_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->idle_ns);
+ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks;
+ rq->idle_ns %= JIFFY_NS;
+ }
+ }
+ acct_update_integrals(idle);
+}
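+
+/*
+ * Example (illustrative, assuming HZ == 1000 so JIFFY_NS == 10^6): three
+ * idle intervals of 0.4ms bank 1.2ms in rq->idle_ns; the third interval
+ * crosses a whole jiffy, so one tick's worth (TICK_APPROX_NS) is charged
+ * to CPUTIME_IDLE and the 0.2ms remainder stays banked for next time.
+ */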
+
+static void pc_system_time(struct rq *rq, struct task_struct *p,
+ int hardirq_offset, unsigned long ns)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long ticks;
+
+ p->stime_ns += ns;
+ if (p->stime_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(p->stime_ns);
+ p->stime_ns %= JIFFY_NS;
+ p->stime += (__force u64)TICK_APPROX_NS * ticks;
+ account_group_system_time(p, TICK_APPROX_NS * ticks);
+ }
+ p->sched_time += ns;
+ account_group_exec_runtime(p, ns);
+
+ if (hardirq_count() - hardirq_offset) {
+ rq->irq_ns += ns;
+ if (rq->irq_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->irq_ns);
+ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks;
+ rq->irq_ns %= JIFFY_NS;
+ }
+ } else if (in_serving_softirq()) {
+ rq->softirq_ns += ns;
+ if (rq->softirq_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->softirq_ns);
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks;
+ rq->softirq_ns %= JIFFY_NS;
+ }
+ } else {
+ rq->system_ns += ns;
+ if (rq->system_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->system_ns);
+ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks;
+ rq->system_ns %= JIFFY_NS;
+ }
+ }
+ acct_update_integrals(p);
+}
+
+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long ticks;
+
+ p->utime_ns += ns;
+ if (p->utime_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(p->utime_ns);
+ p->utime_ns %= JIFFY_NS;
+ p->utime += (__force u64)TICK_APPROX_NS * ticks;
+ account_group_user_time(p, TICK_APPROX_NS * ticks);
+ }
+ p->sched_time += ns;
+ account_group_exec_runtime(p, ns);
+
+ if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ */
+ rq->softirq_ns += ns;
+ if (rq->softirq_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->softirq_ns);
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks;
+ rq->softirq_ns %= JIFFY_NS;
+ }
+ }
+
+ if (task_nice(p) > 0 || idleprio_task(p)) {
+ rq->nice_ns += ns;
+ if (rq->nice_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->nice_ns);
+ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks;
+ rq->nice_ns %= JIFFY_NS;
+ }
+ } else {
+ rq->user_ns += ns;
+ if (rq->user_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->user_ns);
+ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks;
+ rq->user_ns %= JIFFY_NS;
+ }
+ }
+ acct_update_integrals(p);
+}
+
+/*
+ * This is called on clock ticks.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
+{
+ s64 account_ns = rq->niffies - p->last_ran;
+ struct task_struct *idle = rq->idle;
+
+ /* Accurate tick timekeeping */
+ if (user_mode(get_irq_regs()))
+ pc_user_time(rq, p, account_ns);
+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) {
+ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns);
+ } else
+ pc_idle_time(rq, idle, account_ns);
+
+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
+ if (p->policy != SCHED_FIFO && p != idle)
+ p->time_slice -= NS_TO_US(account_ns);
+
+ p->last_ran = rq->niffies;
+}
+
+/*
+ * This is called on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
+{
+ s64 account_ns = rq->niffies - p->last_ran;
+ struct task_struct *idle = rq->idle;
+
+ /* Accurate subtick timekeeping */
+ if (p != idle)
+ pc_user_time(rq, p, account_ns);
+ else
+ pc_idle_time(rq, idle, account_ns);
+
+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
+ if (p->policy != SCHED_FIFO && p != idle)
+ p->time_slice -= NS_TO_US(account_ns);
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been accounted in
+ * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock(p) held.
+ */
+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+ u64 ns = 0;
+
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (p == rq->curr && task_on_rq_queued(p)) {
+ update_clocks(rq);
+ ns = rq->niffies - p->last_ran;
+ }
+
+ return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * Return separately the current's pending runtime that has not been
+ * accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+ u64 ns;
+
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+ /*
+ * 64-bit doesn't need locks to atomically read a 64-bit value.
+	 * So we have an optimisation chance when the task's delta_exec is 0.
+ * Reading ->on_cpu is racy, but this is ok.
+ *
+ * If we race with it leaving CPU, we'll take a lock. So we're correct.
+ * If we race with it entering CPU, unaccounted time is 0. This is
+ * indistinguishable from the read occurring a few cycles earlier.
+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+ * been accounted, so we're correct here as well.
+ */
+ if (!p->on_cpu || !task_on_rq_queued(p))
+ return tsk_seruntime(p);
+#endif
+
+ rq = task_rq_lock(p, &rf);
+ ns = p->sched_time + do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, p, &rf);
+
+ return ns;
+}
+
+/*
+ * Functions to test for when SCHED_ISO tasks have used their allocated
+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All
+ * data is modified only by the local runqueue during scheduler_tick with
+ * interrupts disabled.
+ */
+
+/*
+ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
+ * slow division.
+ */
+static inline void iso_tick(struct rq *rq)
+{
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
+ rq->iso_ticks += 100;
+ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) {
+ rq->iso_refractory = true;
+ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100))
+ rq->iso_ticks = ISO_PERIOD * 100;
+ }
+}
+
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
+static inline void no_iso_tick(struct rq *rq, int ticks)
+{
+ if (rq->iso_ticks > 0 || rq->iso_refractory) {
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
+ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
+ rq->iso_refractory = false;
+ if (unlikely(rq->iso_ticks < 0))
+ rq->iso_ticks = 0;
+ }
+ }
+}
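+
+/*
+ * Example (illustrative): with sched_iso_cpu == 70, iso_refractory is set
+ * once the decaying iso_ticks average exceeds ISO_PERIOD * 70 (ISO tasks
+ * ran as realtime for more than 70% cpu over the rolling period) and is
+ * only cleared again below ~90% of that threshold via the 115/128 shift,
+ * giving the 10% hysteresis described above.
+ */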
+
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
+static void task_running_tick(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ /*
+ * If a SCHED_ISO task is running we increment the iso_ticks. In
+ * order to prevent SCHED_ISO tasks from causing starvation in the
+ * presence of true RT tasks we account those as iso_ticks as well.
+ */
+ if (rt_task(p) || task_running_iso(p))
+ iso_tick(rq);
+ else
+ no_iso_tick(rq, 1);
+
+ /* SCHED_FIFO tasks never run out of timeslice. */
+ if (p->policy == SCHED_FIFO)
+ return;
+
+ if (iso_task(p)) {
+ if (task_running_iso(p)) {
+ if (rq->iso_refractory) {
+ /*
+ * SCHED_ISO task is running as RT and limit
+ * has been hit. Force it to reschedule as
+ * SCHED_NORMAL by zeroing its time_slice
+ */
+ p->time_slice = 0;
+ }
+ } else if (!rq->iso_refractory) {
+ /* Can now run again ISO. Reschedule to pick up prio */
+ goto out_resched;
+ }
+ }
+
+ /*
+ * Tasks that were scheduled in the first half of a tick are not
+ * allowed to run into the 2nd half of the next tick if they will
+ * run out of time slice in the interim. Otherwise, if they have
+ * less than RESCHED_US μs of time slice left they will be rescheduled.
+ * Dither is used as a backup for when hrexpiry is disabled or high res
+ * timers not configured in.
+ */
+ if (p->time_slice - rq->dither >= RESCHED_US)
+ return;
+out_resched:
+ rq_lock(rq);
+ __set_tsk_resched(p);
+ rq_unlock(rq);
+}
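+
+/*
+ * Example (illustrative): with hrexpiry timers active rq->dither is 0, so
+ * a task is only rescheduled here once under RESCHED_US of slice remains;
+ * without highres expiry, dither is HALF_JIFFY_US, so a task that would
+ * exhaust its slice before the second half of the next tick is
+ * rescheduled now instead of overrunning.
+ */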
+
+static inline void task_tick(struct rq *rq)
+{
+ if (!rq_idle(rq))
+ task_running_tick(rq);
+ else if (rq->last_jiffy > rq->last_scheduler_tick)
+ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * We can stop the timer tick any time highres timers are active since
+ * we rely entirely on highres timeouts for task expiry rescheduling.
+ */
+static void sched_stop_tick(struct rq *rq, int cpu)
+{
+ if (!hrexpiry_enabled(rq))
+ return;
+ if (!tick_nohz_full_enabled())
+ return;
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
+static inline void sched_start_tick(struct rq *rq, int cpu)
+{
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
+struct tick_work {
+ int cpu;
+ atomic_t state;
+ struct delayed_work work;
+};
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE 0
+#define TICK_SCHED_REMOTE_OFFLINING 1
+#define TICK_SCHED_REMOTE_RUNNING 2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ * TICK_SCHED_REMOTE_OFFLINE
+ * | ^
+ * | |
+ * | | sched_tick_remote()
+ * | |
+ * | |
+ * +--TICK_SCHED_REMOTE_OFFLINING
+ * | ^
+ * | |
+ * sched_tick_start() | | sched_tick_stop()
+ * | |
+ * V |
+ * TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct tick_work *twork = container_of(dwork, struct tick_work, work);
+ int cpu = twork->cpu;
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *curr;
+ u64 delta;
+ int os;
+
+ /*
+ * Handle the tick only if it appears the remote CPU is running in full
+ * dynticks mode. The check is racy by nature, but missing a tick or
+	 * having one too many is no big deal because the scheduler tick updates
+ * statistics and checks timeslices in a time-independent way, regardless
+ * of when exactly it is running.
+ */
+ if (!tick_nohz_tick_stopped_cpu(cpu))
+ goto out_requeue;
+
+ rq_lock_irq(rq);
+ if (cpu_is_offline(cpu))
+ goto out_unlock;
+
+ curr = rq->curr;
+ update_rq_clock(rq);
+
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a reasonable
+ * amount of time.
+ */
+ delta = rq_clock_task(rq) - curr->last_ran;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
+ task_tick(rq);
+
+out_unlock:
+ rq_unlock_irq(rq, NULL);
+
+out_requeue:
+
+ /*
+ * Run the remote tick once per second (1Hz). This arbitrary
+ * frequency is large enough to avoid overload but short enough
+ * to keep scheduler internal stats reasonably up to date. But
+ * first update state to reflect hotplug activity if required.
+ */
+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ if (os == TICK_SCHED_REMOTE_RUNNING)
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+static void sched_tick_start(int cpu)
+{
+ struct tick_work *twork;
+ int os;
+
+ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ return;
+
+ WARN_ON_ONCE(!tick_work_cpu);
+
+ twork = per_cpu_ptr(tick_work_cpu, cpu);
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+ struct tick_work *twork;
+ int os;
+
+ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ return;
+
+ WARN_ON_ONCE(!tick_work_cpu);
+
+ twork = per_cpu_ptr(tick_work_cpu, cpu);
+ /* There cannot be competing actions, but don't rely on stop-machine. */
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ /* Don't cancel, as this would mess up the state machine. */
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+ tick_work_cpu = alloc_percpu(struct tick_work);
+ BUG_ON(!tick_work_cpu);
+ return 0;
+}
+
+#else /* !CONFIG_NO_HZ_FULL */
+static inline void sched_stop_tick(struct rq *rq, int cpu) {}
+static inline void sched_start_tick(struct rq *rq, int cpu) {}
+static inline void sched_tick_start(int cpu) { }
+static inline void sched_tick_stop(int cpu) { }
+#endif
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ */
+void scheduler_tick(void)
+{
+ int cpu __maybe_unused = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ arch_scale_freq_tick();
+ sched_clock_tick();
+ update_clocks(rq);
+ update_load_avg(rq, 0);
+ update_cpu_clock_tick(rq, rq->curr);
+ task_tick(rq);
+ rq->last_scheduler_tick = rq->last_jiffy;
+ rq->last_tick = rq->clock;
+ psi_task_tick(rq);
+ perf_event_task_tick();
+ sched_stop_tick(rq, cpu);
+}
+
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_TRACE_PREEMPT_TOGGLE))
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just disabled preemption. Start timing the latency.
+ */
+static inline void preempt_latency_start(int val)
+{
+ if (preempt_count() == val) {
+ unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
+}
+
+void preempt_count_add(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
+#endif
+ __preempt_count_add(val);
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Spinlock count overflowing soon?
+ */
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
+#endif
+ preempt_latency_start(val);
+}
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
+
+/*
+ * If the value passed in equals to the current preempt count
+ * then we just enabled preemption. Stop timing the latency.
+ */
+static inline void preempt_latency_stop(int val)
+{
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
+void preempt_count_sub(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
+ /*
+ * Is the spinlock portion underflowing?
+ */
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+#endif
+
+ preempt_latency_stop(val);
+ __preempt_count_sub(val);
+}
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
+
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
+#endif
+
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ return p->preempt_disable_ip;
+#else
+ return 0;
+#endif
+}
+
+/*
+ * The time_slice is only refilled when it is empty and that is when we set a
+ * new deadline. Make sure update_clocks has been called recently to update
+ * rq->niffies.
+ */
+static void time_slice_expired(struct task_struct *p, struct rq *rq)
+{
+ p->time_slice = timeslice();
+ p->deadline = rq->niffies + task_deadline_diff(p);
+#ifdef CONFIG_SMT_NICE
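+	/*
+	 * smt_bias ranks who may run on an SMT sibling: kernel threads
+	 * (no mm) carry no bias, RT outranks ISO, idleprio barely
+	 * registers, and normal tasks decay by one per slice refill
+	 * before resetting from their static priority.
+	 */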
+ if (!p->mm)
+ p->smt_bias = 0;
+ else if (rt_task(p))
+ p->smt_bias = 1 << 30;
+ else if (task_running_iso(p))
+ p->smt_bias = 1 << 29;
+ else if (idleprio_task(p)) {
+ if (task_running_idle(p))
+ p->smt_bias = 0;
+ else
+ p->smt_bias = 1;
+ } else if (--p->smt_bias < 1)
+ p->smt_bias = MAX_PRIO - p->static_prio;
+#endif
+}
+
+/*
+ * Timeslices below RESCHED_US are considered as good as expired as there's no
+ * point rescheduling when there's so little time left. SCHED_BATCH tasks
+ * have been flagged as not latency sensitive and likely to be fully CPU
+ * bound so every time they're rescheduled they have their time_slice
+ * refilled, but get a new later deadline to have little effect on
+ * SCHED_NORMAL tasks.
+ */
+static inline void check_deadline(struct task_struct *p, struct rq *rq)
+{
+ if (p->time_slice < RESCHED_US || batch_task(p))
+ time_slice_expired(p, rq);
+}
+
+/*
+ * Task selection with skiplists is a simple matter of picking off the first
+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1)
+ * being bound to the number of processors.
+ *
+ * Runqueues are selectively locked based on their unlocked data and then
+ * unlocked if not needed. At most 3 locks will be held at any time and are
+ * released as soon as they're no longer needed. All balancing between CPUs
+ * is thus done here in an extremely simple first come best fit manner.
+ *
+ * This iterates over runqueues in cache locality order. In interactive mode
+ * it iterates over all CPUs and finds the task with the best key/deadline.
+ * In non-interactive mode it will only take a task if it's from the current
+ * runqueue or a runqueue with more tasks than the current one with a better
+ * key/deadline.
+ */
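+/*
+ * Sketch of the non-interactive policy above: rq_order() hands back
+ * runqueues from most to least cache-local, so the search skips queues
+ * holding no more entries than the best seen so far and breaks off at
+ * queues beyond LOCALITY_SMP once a candidate task is held.
+ */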
+#ifdef CONFIG_SMP
+static inline struct task_struct
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+ struct rq *locked = NULL, *chosen = NULL;
+ struct task_struct *edt = idle;
+ int i, best_entries = 0;
+ u64 best_key = ~0ULL;
+
+ for (i = 0; i < total_runqueues; i++) {
+ struct rq *other_rq = rq_order(rq, i);
+ skiplist_node *next;
+ int entries;
+
+ entries = other_rq->sl->entries;
+ /*
+		 * Check for queued entries locklessly first. The local runqueue
+ * is locked so entries will always be accurate.
+ */
+ if (!sched_interactive) {
+ /*
+ * Don't reschedule balance across nodes unless the CPU
+ * is idle.
+ */
+ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP)
+ break;
+ if (entries <= best_entries)
+ continue;
+ } else if (!entries)
+ continue;
+
+ /* if (i) implies other_rq != rq */
+ if (i) {
+			/* Check for the best key queued locklessly first */
+ if (other_rq->best_key >= best_key)
+ continue;
+
+ if (unlikely(!trylock_rq(rq, other_rq)))
+ continue;
+
+ /* Need to reevaluate entries after locking */
+ entries = other_rq->sl->entries;
+ if (unlikely(!entries)) {
+ unlock_rq(other_rq);
+ continue;
+ }
+ }
+
+ next = other_rq->node;
+ /*
+ * In interactive mode we check beyond the best entry on other
+ * runqueues if we can't get the best for smt or affinity
+ * reasons.
+ */
+ while ((next = next->next[0]) != other_rq->node) {
+ struct task_struct *p;
+ u64 key = next->key;
+
+ /* Reevaluate key after locking */
+ if (key >= best_key)
+ break;
+
+ p = next->value;
+ if (!smt_schedule(p, rq)) {
+ if (i && !sched_interactive)
+ break;
+ continue;
+ }
+
+ if (sched_other_cpu(p, cpu)) {
+ if (sched_interactive || !i)
+ continue;
+ break;
+ }
+ /* Make sure affinity is ok */
+ if (i) {
+ /* From this point on p is the best so far */
+ if (locked)
+ unlock_rq(locked);
+ chosen = locked = other_rq;
+ }
+ best_entries = entries;
+ best_key = key;
+ edt = p;
+ break;
+ }
+		/*
+		 * rq->preempting is a hint only, as the state may have changed
+		 * since it was set with the resched call, but if we have met
+		 * the condition we can break out here.
+		 */
+ if (edt == rq->preempting)
+ break;
+ if (i && other_rq != chosen)
+ unlock_rq(other_rq);
+ }
+
+ if (likely(edt != idle))
+ take_task(rq, cpu, edt);
+
+ if (locked)
+ unlock_rq(locked);
+
+ rq->preempting = NULL;
+
+ return edt;
+}
+#else /* CONFIG_SMP */
+static inline struct task_struct
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+ struct task_struct *edt;
+
+ if (unlikely(!rq->sl->entries))
+ return idle;
+ edt = rq->node->next[0]->value;
+ take_task(rq, cpu, edt);
+ return edt;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
+{
+ /* Save this before calling printk(), since that will clobber it */
+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
+ if (oops_in_progress)
+ return;
+
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+ prev->comm, prev->pid, preempt_count());
+
+ debug_show_held_locks(prev);
+ print_modules();
+ if (irqs_disabled())
+ print_irqtrace_events(prev);
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
+ }
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+
+/*
+ * Various schedule()-time debugging checks and statistics:
+ */
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
+{
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+ if (task_stack_end_corrupted(prev))
+ panic("corrupted stack end detected inside scheduler\n");
+
+ if (task_scs_end_corrupted(prev))
+ panic("corrupted shadow stack detected inside scheduler\n");
+#endif
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ if (!preempt && prev->state && prev->non_block_count) {
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+ prev->comm, prev->pid, prev->non_block_count);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+ }
+#endif
+
+ if (unlikely(in_atomic_preempt_off())) {
+ __schedule_bug(prev);
+ preempt_count_set(PREEMPT_DISABLED);
+ }
+ rcu_sleep_check();
+ SCHED_WARN_ON(ct_state() == CONTEXT_USER);
+
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+ schedstat_inc(this_rq()->sched_count);
+}
+
+/*
+ * The currently running task's information is all stored in rq local data
+ * which is only modified by the local CPU.
+ */
+static inline void set_rq_task(struct rq *rq, struct task_struct *p)
+{
+ if (p == rq->idle || p->policy == SCHED_FIFO)
+ hrexpiry_clear(rq);
+ else
+ hrexpiry_start(rq, US_TO_NS(p->time_slice));
+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
+ rq->dither = 0;
+ else
+ rq->dither = rq_dither(rq);
+
+ rq->rq_deadline = p->deadline;
+ rq->rq_prio = p->prio;
+#ifdef CONFIG_SMT_NICE
+ rq->rq_mm = p->mm;
+ rq->rq_smt_bias = p->smt_bias;
+#endif
+}
+
+#ifdef CONFIG_SMT_NICE
+static void check_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings;
+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings;
+
+/*
+ * Iterate over SMT siblings when we've scheduled a process on this CPU
+ * and decide whether they should continue running or be descheduled.
+ */
+static void check_smt_siblings(struct rq *this_rq)
+{
+ int other_cpu;
+
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
+ struct task_struct *p;
+ struct rq *rq;
+
+ rq = cpu_rq(other_cpu);
+ if (rq_idle(rq))
+ continue;
+ p = rq->curr;
+ if (!smt_schedule(p, this_rq))
+ resched_curr(rq);
+ }
+}
+
+static void wake_smt_siblings(struct rq *this_rq)
+{
+ int other_cpu;
+
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
+ struct rq *rq;
+
+ rq = cpu_rq(other_cpu);
+ if (rq_idle(rq))
+ resched_idle(rq);
+ }
+}
+#else
+static void check_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_siblings(struct rq __maybe_unused *this_rq) {}
+#endif
+
+/*
+ * schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ * paths. For example, see arch/x86/entry_64.S.
+ *
+ * To drive preemption between tasks, the scheduler sets the flag in timer
+ * interrupt handler scheduler_tick().
+ *
+ * 3. Wakeups don't really cause entry into schedule(). They add a
+ * task to the run-queue and that's it.
+ *
+ * Now, if the new task added to the run-queue preempts the current
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ * called on the nearest possible occasion:
+ *
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
+ *
+ * - in syscall or exception context, at the next outmost
+ * preempt_enable(). (this might be as soon as the wake_up()'s
+ * spin_unlock()!)
+ *
+ * - in IRQ context, return from interrupt-handler to
+ * preemptible context
+ *
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
+ * then at the next:
+ *
+ * - cond_resched() call
+ * - explicit schedule() call
+ * - return from syscall or exception to user-space
+ * - return from interrupt-handler to user-space
+ *
+ * WARNING: must be called with preemption disabled!
+ */
+static void __sched notrace __schedule(bool preempt)
+{
+ struct task_struct *prev, *next, *idle;
+ unsigned long *switch_count;
+ unsigned long prev_state;
+ bool deactivate = false;
+ struct rq *rq;
+ u64 niffies;
+ int cpu;
+
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ prev = rq->curr;
+ idle = rq->idle;
+
+ schedule_debug(prev, preempt);
+
+ local_irq_disable();
+ rcu_note_context_switch(preempt);
+
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up():
+ *
+ * __set_current_state(@state) signal_wake_up()
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
+ * wake_up_state(p, state)
+ * LOCK rq->lock LOCK p->pi_state
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
+ * if (signal_pending_state()) if (p->state & @state)
+ *
+ * Also, the membarrier system call requires a full memory barrier
+ * after coming from user-space, before storing to rq->curr.
+ */
+ rq_lock(rq);
+ smp_mb__after_spinlock();
+#ifdef CONFIG_SMP
+ if (rq->preempt) {
+ /*
+ * Make sure resched_curr hasn't triggered a preemption
+ * locklessly on a task that has since scheduled away. Spurious
+ * wakeup of idle is okay though.
+ */
+ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) {
+ rq->preempt = NULL;
+ clear_preempt_need_resched();
+ rq_unlock_irq(rq, NULL);
+ return;
+ }
+ rq->preempt = NULL;
+ }
+#endif
+
+ switch_count = &prev->nivcsw;
+
+ /*
+ * We must load prev->state once (task_struct::state is volatile), such
+ * that:
+ *
+ * - we form a control dependency vs deactivate_task() below.
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
+ */
+ prev_state = prev->state;
+ if (!preempt && prev_state) {
+ if (signal_pending_state(prev_state, prev)) {
+ prev->state = TASK_RUNNING;
+ } else {
+ prev->sched_contributes_to_load =
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
+ !(prev_state & TASK_NOLOAD) &&
+ !(prev->flags & PF_FROZEN);
+
+ if (prev->sched_contributes_to_load)
+ rq->nr_uninterruptible++;
+
+ /*
+ * __schedule() ttwu()
+ * prev_state = prev->state; if (p->on_rq && ...)
+ * if (prev_state) goto out;
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
+ * p->state = TASK_WAKING
+ *
+ * Where __schedule() and ttwu() have matching control dependencies.
+ *
+ * After this, schedule() must not care about p->state any more.
+ */
+ deactivate = true;
+
+ if (prev->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+ }
+ switch_count = &prev->nvcsw;
+ }
+
+ /*
+ * Store the niffy value here for use by the next task's last_ran
+ * below to avoid losing niffies due to update_clocks being called
+ * again after this point.
+ */
+ update_clocks(rq);
+ niffies = rq->niffies;
+ update_cpu_clock_switch(rq, prev);
+
+ clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
+
+ if (idle != prev) {
+ check_deadline(prev, rq);
+ return_task(prev, rq, cpu, deactivate);
+ }
+
+ next = earliest_deadline_task(rq, cpu, idle);
+ if (likely(next->prio != PRIO_LIMIT))
+ clear_cpuidle_map(cpu);
+ else {
+ set_cpuidle_map(cpu);
+ update_load_avg(rq, 0);
+ }
+
+ set_rq_task(rq, next);
+ next->last_ran = niffies;
+
+ if (likely(prev != next)) {
+ /*
+ * Don't reschedule an idle task or deactivated tasks
+ */
+ if (prev == idle) {
+ inc_nr_running(rq);
+ if (rt_task(next))
+ rq->rt_nr_running++;
+ } else if (!deactivate)
+ resched_suitable_idle(prev);
+ if (unlikely(next == idle)) {
+ dec_nr_running(rq);
+ if (rt_task(prev))
+ rq->rt_nr_running--;
+ wake_siblings(rq);
+ } else
+ check_siblings(rq);
+ rq->nr_switches++;
+ /*
+ * RCU users of rcu_dereference(rq->curr) may not see
+ * changes to task_struct made by pick_next_task().
+ */
+ RCU_INIT_POINTER(rq->curr, next);
+ /*
+ * The membarrier system call requires each architecture
+ * to have a full memory barrier after updating
+ * rq->curr, before returning to user-space.
+ *
+ * Here are the schemes providing that barrier on the
+ * various architectures:
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+ * - finish_lock_switch() for weakly-ordered
+ * architectures where spin_unlock is a full barrier,
+ * - switch_to() for arm64 (weakly-ordered, spin_unlock
+ * is a RELEASE barrier),
+ */
+ ++*switch_count;
+
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+
+ trace_sched_switch(preempt, prev, next);
+ context_switch(rq, prev, next); /* unlocks the rq */
+ } else {
+ check_siblings(rq);
+ rq_unlock(rq);
+ local_irq_enable();
+ }
+}
+
+void __noreturn do_task_dead(void)
+{
+ /* Causes final put_task_struct in finish_task_switch(). */
+ set_special_state(TASK_DEAD);
+
+ /* Tell freezer to ignore us: */
+ current->flags |= PF_NOFREEZE;
+ __schedule(false);
+ BUG();
+
+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
+ for (;;)
+ cpu_relax();
+}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+ unsigned int task_flags;
+
+ if (!tsk->state)
+ return;
+
+ task_flags = tsk->flags;
+ /*
+ * If a worker went to sleep, notify and ask workqueue whether
+ * it wants to wake up a task to maintain concurrency.
+ * As this function is called inside the schedule() context,
+ * we disable preemption to avoid it calling schedule() again
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
+ * requires it.
+ */
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ preempt_disable();
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else
+ io_wq_worker_sleeping(tsk);
+ preempt_enable_no_resched();
+ }
+
+ if (tsk_is_pi_blocked(tsk))
+ return;
+
+ /*
+ * If we are going to sleep and we have plugged IO queued,
+ * make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(tsk))
+ blk_schedule_flush_plug(tsk);
+}
+
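+/*
+ * Counterpart to the sleep notification in sched_submit_work(): tell the
+ * workqueue or io_wq code that the worker is running again.
+ */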
+static inline void sched_update_worker(struct task_struct *tsk)
+{
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_running(tsk);
+ else
+ io_wq_worker_running(tsk);
+ }
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+ sched_submit_work(tsk);
+ do {
+ preempt_disable();
+ __schedule(false);
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+ sched_update_worker(tsk);
+}
+EXPORT_SYMBOL(schedule);
+
+/*
+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
+ * state (have scheduled out non-voluntarily) by making sure that all
+ * tasks have either left the run queue or have gone into user space.
+ * As idle tasks do not do either, they must not ever be preempted
+ * (schedule out non-voluntarily).
+ *
+ * schedule_idle() is similar to schedule_preempt_disabled() except that it
+ * never enables preemption because it does not call sched_submit_work().
+ */
+void __sched schedule_idle(void)
+{
+ /*
+ * As this skips calling sched_submit_work(), which the idle task does
+ * regardless because that function is a nop when the task is in a
+ * TASK_RUNNING state, make sure this isn't used someplace that the
+ * current task can be in any other state. Note, idle is always in the
+ * TASK_RUNNING state.
+ */
+ WARN_ON_ONCE(current->state);
+ do {
+ __schedule(false);
+ } while (need_resched());
+}
+
+#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
+asmlinkage __visible void __sched schedule_user(void)
+{
+ /*
+ * If we come here after a random call to set_need_resched(),
+ * or we have been woken up remotely but the IPI has not yet arrived,
+ * we haven't yet exited the RCU idle mode. Do it here manually until
+ * we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != IN_USER, but that will trigger
+ * too frequently to make sense yet.
+ */
+ enum ctx_state prev_state = exception_enter();
+ schedule();
+ exception_exit(prev_state);
+}
+#endif
+
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+ sched_preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+}
+
+static void __sched notrace preempt_schedule_common(void)
+{
+ do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
+ preempt_disable_notrace();
+ preempt_latency_start(1);
+ __schedule(true);
+ preempt_latency_stop(1);
+ preempt_enable_no_resched_notrace();
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ } while (need_resched());
+}
+
+#ifdef CONFIG_PREEMPTION
+/*
+ * This is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule(void)
+{
+ /*
+ * If there is a non-zero preempt_count or interrupts are disabled,
+ * we do not want to preempt the current task. Just return..
+ */
+ if (likely(!preemptible()))
+ return;
+
+ preempt_schedule_common();
+}
+NOKPROBE_SYMBOL(preempt_schedule);
+EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#endif
+
+/**
+ * preempt_schedule_notrace - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
+ preempt_disable_notrace();
+ preempt_latency_start(1);
+ /*
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ prev_ctx = exception_enter();
+ __schedule(true);
+ exception_exit(prev_ctx);
+
+ preempt_latency_stop(1);
+ preempt_enable_no_resched_notrace();
+ } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#endif
+
+#endif /* CONFIG_PREEMPTION */
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+#include <linux/entry-common.h>
+
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+enum {
+ preempt_dynamic_none = 0,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+};
+
+static int preempt_dynamic_mode = preempt_dynamic_full;
+
+static int sched_dynamic_mode(const char *str)
+{
+	if (!strcmp(str, "none"))
+		return preempt_dynamic_none;
+
+	if (!strcmp(str, "voluntary"))
+		return preempt_dynamic_voluntary;
+
+	if (!strcmp(str, "full"))
+		return preempt_dynamic_full;
+
+	return -EINVAL;
+}
+
+static void sched_dynamic_update(int mode)
+{
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, __cond_resched);
+ static_call_update(preempt_schedule, __preempt_schedule_func);
+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+ static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+ static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ pr_info("Dynamic Preempt: none\n");
+ break;
+
+ case preempt_dynamic_voluntary:
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, __cond_resched);
+ static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+ static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+ static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
+
+ case preempt_dynamic_full:
+ static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(preempt_schedule, __preempt_schedule_func);
+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: full\n");
+ break;
+ }
+
+ preempt_dynamic_mode = mode;
+}
+
+static int __init setup_preempt_mode(char *str)
+{
+	int mode = sched_dynamic_mode(str);
+
+	if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 1;
+ }
+
+ sched_dynamic_update(mode);
+ return 0;
+}
+__setup("preempt=", setup_preempt_mode);
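+
+/*
+ * The boot-time selection above can also be changed at runtime when
+ * CONFIG_SCHED_DEBUG is set, e.g. (assuming the default debugfs mount):
+ *
+ *	echo voluntary > /sys/kernel/debug/sched_preempt
+ */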
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[16];
+ int mode;
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ mode = sched_dynamic_mode(strstrip(buf));
+ if (mode < 0)
+ return mode;
+
+ sched_dynamic_update(mode);
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_dynamic_show(struct seq_file *m, void *v)
+{
+ static const char * preempt_modes[] = {
+ "none", "voluntary", "full"
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, "(");
+ seq_puts(m, preempt_modes[i]);
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, ")");
+
+ seq_puts(m, " ");
+ }
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int sched_dynamic_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_dynamic_show, NULL);
+}
+
+static const struct file_operations sched_dynamic_fops = {
+ .open = sched_dynamic_open,
+ .write = sched_dynamic_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int sched_init_debug_dynamic(void)
+{
+ debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+ return 0;
+}
+late_initcall(sched_init_debug_dynamic);
+
+#endif /* CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+/*
+ * This is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and return with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage __visible void __sched preempt_schedule_irq(void)
+{
+ enum ctx_state prev_state;
+
+ /* Catch callers which need to be fixed */
+ BUG_ON(preempt_count() || !irqs_disabled());
+
+ prev_state = exception_enter();
+
+ do {
+ preempt_disable();
+ local_irq_enable();
+ __schedule(true);
+ local_irq_disable();
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+
+ exception_exit(prev_state);
+}
+
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
+ void *key)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+ return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+#ifdef CONFIG_RT_MUTEXES
+
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task to boost
+ * @pi_task: donor task
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
+ */
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
+{
+ int prio, oldprio;
+ struct rq *rq;
+
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+ /*
+ * If nothing changed; bail early.
+ */
+ if (p->pi_top_task == pi_task && prio == p->prio)
+ return;
+
+ rq = __task_rq_lock(p, NULL);
+ update_rq_clock(rq);
+ /*
+ * Set under pi_lock && rq->lock, such that the value can be used under
+ * either lock.
+ *
+	 * Note that there is a lot of trickiness in making this pointer cache work
+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+ * ensure a task is de-boosted (pi_task is set to NULL) before the
+ * task is allowed to run again (and can exit). This ensures the pointer
+ * points to a blocked task -- which guarantees the task is present.
+ */
+ p->pi_top_task = pi_task;
+
+ /*
+ * For FIFO/RR we only need to set prio, if that matches we're done.
+ */
+ if (prio == p->prio)
+ goto out_unlock;
+
+ /*
+ * Idle task boosting is a nono in general. There is one
+ * exception, when PREEMPT_RT and NOHZ is active:
+ *
+ * The idle task calls get_next_timer_interrupt() and holds
+ * the timer wheel base->lock on the CPU and another CPU wants
+ * to access the timer (probably to cancel it). We can safely
+ * ignore the boosting request, as the idle CPU runs this code
+ * with interrupts disabled and will complete the lock
+ * protected section without being interrupted. So there is no
+ * real need to boost.
+ */
+ if (unlikely(p == rq->idle)) {
+ WARN_ON(p != rq->curr);
+ WARN_ON(p->pi_blocked_on);
+ goto out_unlock;
+ }
+
+ trace_sched_pi_setprio(p, pi_task);
+ oldprio = p->prio;
+ p->prio = prio;
+	if (task_running(rq, p)) {
+ if (prio > oldprio)
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (prio < oldprio)
+ try_preempt(p, rq);
+ }
+out_unlock:
+ /* Avoid rq from going away on us: */
+ preempt_disable();
+ __task_rq_unlock(rq, NULL);
+
+ preempt_enable();
+}
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
+#endif
+
+/*
+ * Adjust the deadline for when the priority is to change, before it's
+ * changed.
+ */
+static inline void adjust_deadline(struct task_struct *p, int new_prio)
+{
+ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
+}
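+
+/*
+ * Example for adjust_deadline() above: renicing towards a higher
+ * priority shrinks static_deadline_diff() below the old offset, pulling
+ * any pending deadline earlier by the difference.
+ */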
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ int new_static, old_static;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+ return;
+ new_static = NICE_TO_PRIO(nice);
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+	 * it won't have any effect on scheduling until the task becomes
+	 * SCHED_NORMAL/SCHED_BATCH again:
+ */
+ if (has_rt_policy(p)) {
+ p->static_prio = new_static;
+ goto out_unlock;
+ }
+
+ adjust_deadline(p, new_static);
+ old_static = p->static_prio;
+ p->static_prio = new_static;
+ p->prio = effective_prio(p);
+
+ if (task_queued(p)) {
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (new_static < old_static)
+ try_preempt(p, rq);
+ } else if (task_running(rq, p)) {
+ set_rq_task(rq, p);
+ if (old_static < new_static)
+ resched_task(p);
+ }
+out_unlock:
+ task_rq_unlock(rq, p, &rf);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ /* Convert nice value [19,-20] to rlimit style value [1,40] */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+ capable(CAP_SYS_NICE));
+}
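+
+/*
+ * Example for can_nice() above: nice_to_rlimit() maps nice 19..-20 onto
+ * 1..40, so a task with RLIMIT_NICE of 25 may drop its nice value as far
+ * as -5 without CAP_SYS_NICE.
+ */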
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ *
+ * sched policy return value kernel prio user prio/nice
+ *
+ * normal, batch, [1 ... 41] 101 0/[-20 ... 19]
+ * idle [42 ... 81] 102 0/[-20 ... 19]
+ * iso [0 ... 41] 100 0/[-20 ... 19]
+ * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
+ */
+int task_prio(const struct task_struct *p)
+{
+ int delta, prio = p->prio - MAX_RT_PRIO;
+
+ /* rt tasks and iso tasks */
+ if (prio <= 0)
+ goto out;
+
+ /* Convert to ms to avoid overflows */
+ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies);
+ if (unlikely(delta < 0))
+ delta = 0;
+ delta = delta * 40 / ms_longest_deadline_diff();
+ if (delta <= 80)
+ prio += delta;
+ if (idleprio_task(p))
+ prio += 40;
+out:
+ return prio;
+}
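+
+/*
+ * Worked example (illustrative) for the table above: a SCHED_NORMAL task
+ * halfway to the longest relative deadline has delta = 40 * 1/2 = 20
+ * added to its base value, and an idleprio task is pushed a further 40
+ * places down.
+ */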
+
+#ifdef CONFIG_SMP
+static inline bool rt_rq_is_runnable(struct rq *rt_rq)
+{
+ return rt_rq->rt_nr_running;
+}
+
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum cpu_util_type type,
+ struct task_struct *p)
+{
+ unsigned long dl_util, util, irq;
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!uclamp_is_used() &&
+ type == FREQUENCY_UTIL && rt_rq_is_runnable(rq)) {
+ return max;
+ }
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= max))
+ return max;
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ *
+ * CFS and RT utilization can be boosted or capped, depending on
+ * utilization clamp constraints requested by currently RUNNABLE
+ * tasks.
+ * When there are no CFS RUNNABLE tasks, clamps are released and
+ * frequency will be gracefully reduced with the utilization decay.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ if (type == FREQUENCY_UTIL)
+ util = uclamp_rq_util_with(rq, util, p);
+
+ dl_util = cpu_util_dl(rq);
+
+ /*
+ * For frequency selection we do not make cpu_util_dl() a permanent part
+ * of this sum because we want to use cpu_bw_dl() later on, but we need
+ * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+ * that we select f_max when there is no idle time.
+ *
+ * NOTE: numerical errors or stop class might cause us to not quite hit
+ * saturation when we should -- something for later.
+ */
+ if (util + dl_util >= max)
+ return max;
+
+ /*
+ * OTOH, for energy computation we need the estimated running time, so
+ * include util_dl and ignore dl_bw.
+ */
+ if (type == ENERGY_UTIL)
+ util += dl_util;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * irq metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
+ */
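+	/*
+	 * Illustrative numbers: with max = 1024, irq = 256 and U = 512,
+	 * U' = 256 + (768/1024) * 512 = 640.
+	 */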
+ util = scale_irq_capacity(util, irq, max);
+ util += irq;
+
+ /*
+ * Bandwidth required by DEADLINE must always be granted while, for
+ * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+ * to gracefully reduce the frequency when no tasks show up for longer
+ * periods of time.
+ *
+ * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+ * bw_dl as requested freq. However, cpufreq is not yet ready for such
+ * an interface. So, we only do the latter for now.
+ */
+ if (type == FREQUENCY_UTIL)
+ util += cpu_bw_dl(rq);
+
+ return min(max, util);
+}
+
+unsigned long sched_cpu_util(int cpu, unsigned long max)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+ ENERGY_UTIL, NULL);
+}
+#endif /* CONFIG_SMP */
+
+/**
+ * idle_cpu - is a given CPU idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle)
+ return 0;
+
+ if (rq->nr_running)
+ return 0;
+
+#ifdef CONFIG_SMP
+ if (rq->ttwu_pending)
+ return 0;
+#endif
+
+ return 1;
+}
+
+/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+ if (!idle_cpu(cpu))
+ return 0;
+
+ if (vcpu_is_preempted(cpu))
+ return 0;
+
+ return 1;
+}
+
+/**
+ * idle_task - return the idle task for a given CPU.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the CPU @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static inline struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock. */
+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
+ int prio, const struct sched_attr *attr,
+ bool keep_boost)
+{
+ int oldrtprio, oldprio;
+
+	/*
+	 * If the params can't change, scheduling class changes aren't
+	 * allowed either.
+	 */
+ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ return;
+
+ p->policy = policy;
+ oldrtprio = p->rt_priority;
+ p->rt_priority = prio;
+ p->normal_prio = normal_prio(p);
+ oldprio = p->prio;
+ /*
+ * Keep a potential priority boosting if called from
+ * sched_setscheduler().
+ */
+ p->prio = normal_prio(p);
+ if (keep_boost)
+ p->prio = rt_effective_prio(p, p->prio);
+
+ if (task_running(rq, p)) {
+ set_rq_task(rq, p);
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (p->prio < oldprio || p->rt_priority > oldrtprio)
+ try_preempt(p, rq);
+ }
+}
+
+/*
+ * Check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ bool match;
+
+ rcu_read_lock();
+ pcred = __task_cred(p);
+ match = (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+ rcu_read_unlock();
+ return match;
+}
+
+static int __sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ bool user, bool pi)
+{
+ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority;
+ unsigned long rlim_rtprio = 0;
+ struct rq_flags rf;
+ int reset_on_fork;
+ struct rq *rq;
+
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
+
+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+ unsigned long lflags;
+
+ if (!lock_task_sighand(p, &lflags))
+ return -ESRCH;
+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+ unlock_task_sighand(p, &lflags);
+ if (rlim_rtprio)
+ goto recheck;
+ /*
+ * If the caller requested an RT policy without having the
+ * necessary rights, we downgrade the policy to SCHED_ISO.
+ * We also set the parameter to zero to pass the checks.
+ */
+ policy = SCHED_ISO;
+ priority = 0;
+ }
+recheck:
+ /* Double check policy once rq lock held */
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
+ policy = oldpolicy = p->policy;
+ } else {
+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+ policy &= ~SCHED_RESET_ON_FORK;
+
+ if (!SCHED_RANGE(policy))
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+ return -EINVAL;
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL and
+ * SCHED_BATCH is 0.
+ */
+ if (priority > MAX_RT_PRIO-1)
+ return -EINVAL;
+ if (is_rt_policy(policy) != (priority != 0))
+ return -EINVAL;
+
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (user && !capable(CAP_SYS_NICE)) {
+ if (is_rt_policy(policy)) {
+ unsigned long rlim_rtprio =
+ task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* Can't increase priority */
+ if (priority > p->rt_priority &&
+ priority > rlim_rtprio)
+ return -EPERM;
+ } else {
+ switch (p->policy) {
+ /*
+ * Can only downgrade policies but not back to
+ * SCHED_NORMAL
+ */
+ case SCHED_ISO:
+ if (policy == SCHED_ISO)
+ goto out;
+ if (policy != SCHED_NORMAL)
+ return -EPERM;
+ break;
+ case SCHED_BATCH:
+ if (policy == SCHED_BATCH)
+ goto out;
+ if (policy != SCHED_IDLEPRIO)
+ return -EPERM;
+ break;
+ case SCHED_IDLEPRIO:
+ if (policy == SCHED_IDLEPRIO)
+ goto out;
+ return -EPERM;
+ default:
+ break;
+ }
+ }
+
+ /* Can't change other user's priorities */
+ if (!check_same_owner(p))
+ return -EPERM;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ return -EPERM;
+ }
+
+ if (user) {
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+ }
+
+ if (pi)
+ cpuset_read_lock();
+
+ /*
+ * Make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ *
+ * To be able to change p->policy safely, the runqueue lock must be
+ * held.
+ */
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ /*
+	 * Changing the policy of the stop threads is a very bad idea:
+ */
+ if (p == rq->stop) {
+ retval = -EINVAL;
+ goto unlock;
+ }
+
+ /*
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
+ */
+ if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
+ priority == p->rt_priority))) {
+ p->sched_reset_on_fork = reset_on_fork;
+ retval = 0;
+ goto unlock;
+ }
+
+ /* Re-check policy now with rq lock held */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
+ goto recheck;
+ }
+ p->sched_reset_on_fork = reset_on_fork;
+
+ __setscheduler(p, rq, policy, priority, attr, pi);
+
+ /* Avoid rq from going away on us: */
+ preempt_disable();
+ task_rq_unlock(rq, p, &rf);
+
+ if (pi) {
+ cpuset_read_unlock();
+ rt_mutex_adjust_pi(p);
+ }
+ preempt_enable();
+out:
+ return 0;
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
+ return retval;
+}
+
+static int _sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool check)
+{
+ struct sched_attr attr = {
+ .sched_policy = policy,
+ .sched_priority = param->sched_priority,
+ .sched_nice = PRIO_TO_NICE(p->static_prio),
+ };
+
+ return __sched_setscheduler(p, &attr, check, true);
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Use sched_set_fifo(), read its comment.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, true);
+}
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true, true);
+}
+
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, false, true);
+}
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, false);
+}
+
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+void sched_set_normal(struct task_struct *p, int nice)
+{
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = nice,
+ };
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+ rcu_read_unlock();
+
+ if (likely(p)) {
+ retval = sched_setscheduler(p, policy, &lparam);
+ put_task_struct(p);
+ }
+
+ return retval;
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ /* Zero the full structure, so that a short copy will be nice: */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ /* ABI compatibility quirk: */
+ if (!size)
+ size = SCHED_ATTR_SIZE_VER0;
+
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
+ goto err_size;
+
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
+ }
+
+ /*
+ * XXX: Do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+ /* sched/core.c uses zero here but we already know ret is zero */
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
+{
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || flags)
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ attr.sched_policy = SETPARAM_POLICY;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+ rcu_read_unlock();
+
+ if (likely(p)) {
+ retval = sched_setattr(p, &attr);
+ put_task_struct(p);
+ }
+
+ return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+ struct task_struct *p;
+ int retval = -EINVAL;
+
+ if (pid < 0)
+ goto out_nounlock;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (p) {
+ retval = security_task_getscheduler(p);
+ if (!retval)
+ retval = p->policy;
+ }
+ rcu_read_unlock();
+
+out_nounlock:
+ return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ struct sched_param lp = { .sched_priority = 0 };
+ struct task_struct *p;
+ int retval = -EINVAL;
+
+ if (!param || pid < 0)
+ goto out_nounlock;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ if (has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ rcu_read_unlock();
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
+{
+ unsigned int ksize = sizeof(*kattr);
+
+ if (!access_ok(uattr, usize))
+ return -EFAULT;
+
+ /*
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * which part the kernel doesn't know about. Just ignore it - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
+ */
+ kattr->size = min(usize, ksize);
+
+ if (copy_to_user(uattr, kattr, kattr->size))
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @usize: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, usize, unsigned int, flags)
+{
+ struct sched_attr kattr = { };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ kattr.sched_policy = p->policy;
+ if (rt_task(p))
+ kattr.sched_priority = p->rt_priority;
+ else
+ kattr.sched_nice = task_nice(p);
+
+ rcu_read_unlock();
+
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ cpumask_var_t cpus_allowed, new_mask;
+ struct task_struct *p;
+ int retval;
+
+ rcu_read_lock();
+
+ p = find_process_by_pid(pid);
+ if (!p) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ /* Prevent p going away */
+ get_task_struct(p);
+ rcu_read_unlock();
+
+ if (p->flags & PF_NO_SETAFFINITY) {
+ retval = -EINVAL;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+ retval = -EPERM;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, in_mask, cpus_allowed);
+again:
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+
+ if (!retval) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
+ }
+out_unlock:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+out_put_task:
+ put_task_struct(p);
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ cpumask_t *new_mask)
+{
+ if (len < cpumask_size())
+ cpumask_clear(new_mask);
+ else if (len > cpumask_size())
+ len = cpumask_size();
+
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+
+/**
+ * sys_sched_setaffinity - set the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new CPU mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ cpumask_var_t new_mask;
+ int retval;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval == 0)
+ retval = sched_setaffinity(pid, new_mask);
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+ struct task_struct *p;
+ unsigned long flags;
+ int retval;
+
+ get_online_cpus();
+ rcu_read_lock();
+
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+out_unlock:
+ rcu_read_unlock();
+ put_online_cpus();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current CPU mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_var_t mask;
+
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
+ return -EINVAL;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ unsigned int retlen = min(len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ free_cpumask_var(mask);
+
+ return ret;
+}
+
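+/*
+ * Yield behaviour is controlled by the sched_yield_type sysctl: 0 turns
+ * sys_sched_yield into a no-op, 1 only reschedules, and higher values also
+ * expire the yielding task's timeslice before rescheduling.
+ */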
+static void do_sched_yield(void)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!sched_yield_type)
+ return;
+
+ rq = this_rq_lock_irq(&rf);
+
+ if (sched_yield_type > 1)
+ time_slice_expired(current, rq);
+ schedstat_inc(rq->yld_count);
+
+ preempt_disable();
+ rq_unlock_irq(rq, &rf);
+ sched_preempt_enable_no_resched();
+
+ schedule();
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+ do_sched_yield();
+ return 0;
+}
+
+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+int __sched __cond_resched(void)
+{
+ if (should_resched(0)) {
+ preempt_schedule_common();
+ return 1;
+ }
+#ifndef CONFIG_PREEMPT_RCU
+ rcu_all_qs();
+#endif
+ return 0;
+}
+EXPORT_SYMBOL(__cond_resched);
+#endif
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(cond_resched);
+
+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(might_resched);
+#endif
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held(lock);
+
+ if (spin_needbreak(lock) || resched) {
+ spin_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
+
+int __cond_resched_rwlock_read(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held_read(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ read_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ read_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
+
+int __cond_resched_rwlock_write(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held_write(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ write_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ write_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ do_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *rq_p;
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ int yielded = 0;
+
+ local_irq_save(flags);
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+	/*
+	 * If the target task is currently running, or is not runnable at
+	 * all, there is no task to yield to.
+	 */
+ if (task_running(p_rq, p) || p->state) {
+ yielded = -ESRCH;
+ goto out_irq;
+ }
+
+ double_rq_lock(rq, p_rq);
+ if (unlikely(task_rq(p) != p_rq)) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ yielded = 1;
+ schedstat_inc(rq->yld_count);
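+	/*
+	 * Donate the yielding task's remaining timeslice and, if earlier, its
+	 * deadline to the target, then expire the yielder's own slice.
+	 */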
+ rq_p = rq->curr;
+ if (p->deadline > rq_p->deadline)
+ p->deadline = rq_p->deadline;
+ p->time_slice += rq_p->time_slice;
+ if (p->time_slice > timeslice())
+ p->time_slice = timeslice();
+ time_slice_expired(rq_p, rq);
+ if (preempt && rq != p_rq)
+ resched_task(p_rq->curr);
+ double_rq_unlock(rq, p_rq);
+out_irq:
+ local_irq_restore(flags);
+
+ if (yielded > 0)
+ schedule();
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+int io_schedule_prepare(void)
+{
+ int old_iowait = current->in_iowait;
+
+ current->in_iowait = 1;
+ blk_schedule_flush_plug(current);
+
+ return old_iowait;
+}
+
+void io_schedule_finish(int token)
+{
+ current->in_iowait = token;
+}
+
+/*
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+
+long __sched io_schedule_timeout(long timeout)
+{
+ int token;
+ long ret;
+
+ token = io_schedule_prepare();
+ ret = schedule_timeout(timeout);
+ io_schedule_finish(token);
+
+ return ret;
+}
+EXPORT_SYMBOL(io_schedule_timeout);
+
+void __sched io_schedule(void)
+{
+ int token;
+
+ token = io_schedule_prepare();
+ schedule();
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(io_schedule);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_RT_PRIO-1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_ISO:
+ case SCHED_IDLEPRIO:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_ISO:
+ case SCHED_IDLEPRIO:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
+{
+ struct task_struct *p;
+ unsigned int time_slice;
+ struct rq_flags rf;
+ struct rq *rq;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ rq = task_rq_lock(p, &rf);
+ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
+ task_rq_unlock(rq, p, &rf);
+
+ rcu_read_unlock();
+ *t = ns_to_timespec64(time_slice);
+ return 0;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct __kernel_timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+ struct old_timespec32 __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_old_timespec32(&t, interval);
+ return retval;
+}
+#endif
+
+void sched_show_task(struct task_struct *p)
+{
+ unsigned long free = 0;
+ int ppid;
+
+ if (!try_get_task_stack(p))
+ return;
+
+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+ if (p->state == TASK_RUNNING)
+ printk(KERN_CONT " running task ");
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ free = stack_not_used(p);
+#endif
+ ppid = 0;
+ rcu_read_lock();
+ if (pid_alive(p))
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ rcu_read_unlock();
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ free, task_pid_nr(p), ppid,
+ (unsigned long)task_thread_info(p)->flags);
+
+ print_worker_info(KERN_INFO, p);
+ print_stop_info(KERN_INFO, p);
+ show_stack(p, NULL, KERN_INFO);
+ put_task_stack(p);
+}
+EXPORT_SYMBOL_GPL(sched_show_task);
+
+static inline bool
+state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+ /* no filter, everything matches */
+ if (!state_filter)
+ return true;
+
+ /* filter, but doesn't match */
+ if (!(p->state & state_filter))
+ return false;
+
+ /*
+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
+ * TASK_KILLABLE).
+ */
+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ return false;
+
+ return true;
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+ struct task_struct *g, *p;
+
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ /*
+ * reset the NMI-timeout, listing all files on a slow
+ * console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
+ */
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
+ if (state_filter_match(state_filter, p))
+ sched_show_task(p);
+ }
+
+ rcu_read_unlock();
+ /*
+ * Only show locks if all tasks are dumped:
+ */
+ if (!state_filter)
+ debug_show_all_locks();
+}
+
+void dump_cpu_task(int cpu)
+{
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
+}
+
+#ifdef CONFIG_SMP
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 __always_unused flags)
+{
+ cpumask_copy(&p->cpus_mask, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&p->pi_lock);
+
+ cpumask_copy(&p->cpus_mask, new_mask);
+
+ if (task_queued(p)) {
+ /*
+ * Because __kthread_bind() calls this on blocked tasks without
+ * holding rq->lock.
+ */
+ lockdep_assert_held(rq->lock);
+ }
+}
+
+/*
+ * Calling do_set_cpus_allowed from outside the scheduler code should not be
+ * called on a running or queued task. We should be holding pi_lock.
+ */
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ __do_set_cpus_allowed(p, new_mask);
+ if (needs_other_cpu(p, task_cpu(p))) {
+ struct rq *rq;
+
+ rq = __task_rq_lock(p, NULL);
+ set_task_cpu(p, valid_task_cpu(p));
+ resched_task(p);
+ __task_rq_unlock(rq, NULL);
+ }
+}
+
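+/* MuQSS has no migrate_disable support; these remain empty stubs. */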
+void migrate_disable(void)
+{
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+#endif
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void __init init_idle(struct task_struct *idle, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /*
+ * The idle task doesn't need the kthread struct to function, but it
+ * is dressed up as a per-CPU kthread and thus needs to play the part
+ * if we want to avoid special-casing it in code that deals with per-CPU
+ * kthreads.
+ */
+ set_kthread_struct(idle);
+
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(rq->lock);
+ idle->last_ran = rq->niffies;
+ time_slice_expired(idle, rq);
+ idle->state = TASK_RUNNING;
+ /* Setting prio to illegal value shouldn't matter when never queued */
+ idle->prio = PRIO_LIMIT;
+ /*
+ * PF_KTHREAD should already be set at this point; regardless, make it
+ * look like a proper per-CPU kthread.
+ */
+ idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
+ kthread_set_per_cpu(idle, cpu);
+
+ scs_task_reset(idle);
+ kasan_unpoison_task_stack(idle);
+
+#ifdef CONFIG_SMP
+ /*
+ * It's possible that init_idle() gets called multiple times on a task,
+ * in that case do_set_cpus_allowed() will not do the right thing.
+ *
+ * And since this is boot we can forgo the serialisation.
+ */
+ set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
+#ifdef CONFIG_SMT_NICE
+ idle->smt_bias = 0;
+#endif
+#endif
+ set_rq_task(rq, idle);
+
+ /* Silence PROVE_RCU */
+ rcu_read_lock();
+ set_task_cpu(idle, cpu);
+ rcu_read_unlock();
+
+ rq->idle = idle;
+ rcu_assign_pointer(rq->curr, idle);
+ idle->on_rq = TASK_ON_RQ_QUEUED;
+ raw_spin_unlock(rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
+
+ /* Set the preempt count _outside_ the spinlocks! */
+ init_idle_preempt_count(idle, cpu);
+
+ ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle, cpu);
+#ifdef CONFIG_SMP
+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
+ const struct cpumask __maybe_unused *trial)
+{
+ return 1;
+}
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their CPU
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_mask may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY)
+ ret = -EINVAL;
+
+ return ret;
+}
+
+void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
+void select_nohz_load_balancer(int stop_tick)
+{
+}
+
+void set_cpu_sd_state_idle(void) {}
+void nohz_balance_enter_idle(int cpu) {}
+
+/*
+ * In the semi idle case, use the nearest busy CPU for migrating timers
+ * from an idle CPU. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle CPU will add more delays to the timers than intended
+ * (as that CPU's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+ int i, cpu = smp_processor_id(), default_cpu = -1;
+ struct sched_domain *sd;
+
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (!idle_cpu(cpu))
+ return cpu;
+ default_cpu = cpu;
+ }
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ for_each_cpu_and(i, sched_domain_span(sd),
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i)) {
+ cpu = i;
+ goto unlock;
+ }
+ }
+ }
+
+ if (default_cpu == -1)
+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ cpu = default_cpu;
+unlock:
+ rcu_read_unlock();
+ return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+ if (cpu == smp_processor_id())
+ return;
+
+ if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
+ smp_sched_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+ /*
+ * We just need the target to call irq_exit() and re-evaluate
+ * the next tick. The nohz full kick at least implies that.
+ * If needed we can still optimize that later with an
+ * empty IRQ.
+ */
+ if (cpu_is_offline(cpu))
+ return true; /* Don't try to wake offline CPUs. */
+ if (tick_nohz_full_cpu(cpu)) {
+ if (cpu != smp_processor_id() ||
+ tick_nohz_tick_stopped())
+ tick_nohz_full_kick_cpu(cpu);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Wake up the specified CPU. If the CPU is going offline, it is the
+ * caller's responsibility to deal with the lost wakeup, for example,
+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
+ */
+void wake_up_nohz_cpu(int cpu)
+{
+ if (!wake_up_full_nohz_cpu(cpu))
+ wake_up_idle_cpu(cpu);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask,
+ u32 flags)
+{
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ bool queued = false, running_wrong = false, kthread;
+ unsigned int dest_cpu;
+ struct rq_flags rf;
+ struct rq *rq;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ kthread = !!(p->flags & PF_KTHREAD);
+ if (kthread) {
+ /*
+ * Kernel threads are allowed on online && !active CPUs
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
+
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (cpumask_equal(&p->cpus_mask, new_mask))
+ goto out;
+ /*
+ * Picking a ~random cpu helps in cases where we are changing affinity
+ * for groups of tasks (ie. cpuset), so that load balancing is not
+ * immediately required to distribute the tasks within their new mask.
+ */
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ queued = task_queued(p);
+ __do_set_cpus_allowed(p, new_mask);
+
+ if (kthread) {
+ /*
+ * For kernel threads that do indeed end up on online &&
+ * !active we want to ensure they are strict per-CPU threads.
+ */
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
+ p->nr_cpus_allowed != 1);
+ }
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ if (task_running(rq, p)) {
+ /* Task is running on the wrong cpu now, reschedule it. */
+ if (rq == this_rq()) {
+ set_task_cpu(p, dest_cpu);
+ set_tsk_need_resched(p);
+ running_wrong = true;
+ } else
+ resched_task(p);
+ } else {
+ if (queued) {
+ /*
+ * Switch runqueue locks after dequeueing the task
+ * here while still holding the pi_lock to be holding
+ * the correct lock for enqueueing.
+ */
+ dequeue_task(rq, p, 0);
+ rq_unlock(rq);
+
+ rq = cpu_rq(dest_cpu);
+ rq_lock(rq);
+ }
+ set_task_cpu(p, dest_cpu);
+ if (queued)
+ enqueue_task(rq, p, 0);
+ }
+ if (queued)
+ try_preempt(p, rq);
+ if (running_wrong)
+ preempt_disable();
+out:
+ task_rq_unlock(rq, p, &rf);
+
+ if (running_wrong) {
+ __schedule(true);
+ preempt_enable();
+ }
+
+ return ret;
+}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, 0);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Run through task list and find tasks affined to the dead cpu, then remove
+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold
+ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and
+ * pi_lock to change cpus_mask but it's not going to matter here.
+ */
+static void bind_zero(int src_cpu)
+{
+ struct task_struct *p, *t;
+ struct rq *rq0;
+ int bound = 0;
+
+ if (src_cpu == 0)
+ return;
+
+ rq0 = cpu_rq(0);
+
+ do_each_thread(t, p) {
+ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) {
+ bool local = (task_cpu(p) == src_cpu);
+ struct rq *rq = task_rq(p);
+
+ /* task_running is the cpu stopper thread */
+ if (local && task_running(rq, p))
+ continue;
+ atomic_clear_cpu(src_cpu, &p->cpus_mask);
+ atomic_set_cpu(0, &p->cpus_mask);
+ p->zerobound = true;
+ bound++;
+ if (local) {
+ bool queued = task_queued(p);
+
+ if (queued)
+ dequeue_task(rq, p, 0);
+ set_task_cpu(p, 0);
+ if (queued)
+ enqueue_task(rq0, p, 0);
+ }
+ }
+ } while_each_thread(t, p);
+
+ if (bound) {
+ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n",
+ bound, src_cpu);
+ }
+}
+
+/* Find processes with the zerobound flag and reenable their affinity for the
+ * CPU coming alive. */
+static void unbind_zero(int src_cpu)
+{
+ int unbound = 0, zerobound = 0;
+ struct task_struct *p, *t;
+
+ if (src_cpu == 0)
+ return;
+
+ do_each_thread(t, p) {
+ if (!p->mm)
+ p->zerobound = false;
+ if (p->zerobound) {
+ unbound++;
+ cpumask_set_cpu(src_cpu, &p->cpus_mask);
+ /* Once every CPU affinity has been re-enabled, remove
+ * the zerobound flag */
+ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) {
+ p->zerobound = false;
+ zerobound++;
+ }
+ }
+ } while_each_thread(t, p);
+
+ if (unbound) {
+ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n",
+ unbound, src_cpu);
+ }
+ if (zerobound) {
+ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n",
+ zerobound);
+ }
+}
+
+/*
+ * Ensure that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+ struct mm_struct *mm = current->active_mm;
+
+ BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);
+
+ if (mm != &init_mm) {
+ switch_mm(mm, &init_mm, current);
+ finish_arch_post_lock_switch();
+ }
+
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void unbind_zero(int src_cpu) {}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param stop_param = { .sched_priority = STOP_PRIO };
+ struct sched_param start_param = { .sched_priority = 0 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+		 * Make it appear like a SCHED_FIFO task, it's something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling policy so that
+ * it can die in pieces.
+ */
+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
+ }
+}
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+ return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+ struct ctl_table *entry;
+
+ /*
+ * In the intermediate directories, both the child directory and
+ * procname are dynamically allocated and could fail but the mode
+ * will always be set. In the lowest directory the names are
+ * static strings and all have proc handlers.
+ */
+ for (entry = *tablep; entry->mode; entry++) {
+ if (entry->child)
+ sd_free_ctl_entry(&entry->child);
+ if (entry->proc_handler == NULL)
+ kfree(entry->procname);
+ }
+
+ kfree(*tablep);
+ *tablep = NULL;
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ umode_t mode, proc_handler *proc_handler)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(9);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
+ /* &table[8] is terminator */
+
+ return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+ if (table == NULL)
+ return NULL;
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static cpumask_var_t sd_sysctl_cpus;
+static struct ctl_table_header *sd_sysctl_header;
+
+void register_sched_domain_sysctl(void)
+{
+ static struct ctl_table *cpu_entries;
+ static struct ctl_table **cpu_idx;
+ char buf[32];
+ int i;
+
+ if (!cpu_entries) {
+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
+ if (!cpu_entries)
+ return;
+
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = cpu_entries;
+ }
+
+ if (!cpu_idx) {
+ struct ctl_table *e = cpu_entries;
+
+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
+ if (!cpu_idx)
+ return;
+
+ /* deal with sparse possible map */
+ for_each_possible_cpu(i) {
+ cpu_idx[i] = e;
+ e++;
+ }
+ }
+
+ if (!cpumask_available(sd_sysctl_cpus)) {
+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+ return;
+
+ /* init to possible to not have holes in @cpu_entries */
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ }
+
+ for_each_cpu(i, sd_sysctl_cpus) {
+ struct ctl_table *e = cpu_idx[i];
+
+ if (e->child)
+ sd_free_ctl_entry(&e->child);
+
+ if (!e->procname) {
+ snprintf(buf, 32, "cpu%d", i);
+ e->procname = kstrdup(buf, GFP_KERNEL);
+ }
+ e->mode = 0555;
+ e->child = sd_alloc_ctl_cpu_table(i);
+
+ __cpumask_clear_cpu(i, sd_sysctl_cpus);
+ }
+
+ WARN_ON(sd_sysctl_header);
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+void dirty_sched_domain_sysctl(int cpu)
+{
+ if (cpumask_available(sd_sysctl_cpus))
+ __cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+ unregister_sysctl_table(sd_sysctl_header);
+ sd_sysctl_header = NULL;
+}
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
+
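+/*
+ * Track a runqueue's CPU in its root domain's online mask; taking it offline
+ * also clears the CPU from the idle map.
+ */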
+void set_rq_online(struct rq *rq)
+{
+ if (!rq->online) {
+ cpumask_set_cpu(cpu_of(rq), rq->rd->online);
+ rq->online = true;
+ }
+}
+
+void set_rq_offline(struct rq *rq)
+{
+ if (rq->online) {
+ int cpu = cpu_of(rq);
+
+ cpumask_clear_cpu(cpu, rq->rd->online);
+ rq->online = false;
+ clear_cpuidle_map(cpu);
+ }
+}
+
+/*
+ * used to mark begin/end of suspend/resume:
+ */
+static int num_cpus_frozen;
+
+/*
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
+ */
+static void cpuset_cpu_active(void)
+{
+ if (cpuhp_tasks_frozen) {
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
+ return;
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+ cpuset_force_rebuild();
+ }
+
+ cpuset_update_active_cpus();
+}
+
+static int cpuset_cpu_inactive(unsigned int cpu)
+{
+ if (!cpuhp_tasks_frozen) {
+ cpuset_update_active_cpus();
+ } else {
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ }
+ return 0;
+}
+
+int sched_cpu_activate(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When going up, increment the number of cores with SMT present.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_inc_cpuslocked(&sched_smt_present);
+#endif
+ set_cpu_active(cpu, true);
+
+ if (sched_smp_initialized) {
+ sched_domains_numa_masks_set(cpu);
+ cpuset_cpu_active();
+ }
+
+ /*
+ * Put the rq online, if not already. This happens:
+ *
+ * 1) In the early boot process, because we build the real domains
+ * after all CPUs have been brought up.
+ *
+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+ * domains.
+ */
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_online(rq);
+ }
+ unbind_zero(cpu);
+ rq_unlock_irqrestore(rq, &rf);
+
+ return 0;
+}
+
+int sched_cpu_deactivate(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ int ret;
+
+ set_cpu_active(cpu, false);
+ /*
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
+ * users of this state to go away such that all new such users will
+ * observe it.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+ synchronize_rcu();
+
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ update_rq_clock(rq);
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When going down, decrement the number of cores with SMT present.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
+
+ if (!sched_smp_initialized)
+ return 0;
+
+ ret = cpuset_cpu_inactive(cpu);
+ if (ret) {
+ set_cpu_active(cpu, true);
+ return ret;
+ }
+ sched_domains_numa_masks_clear(cpu);
+ return 0;
+}
+
+int sched_cpu_starting(unsigned int cpu)
+{
+ sched_tick_start(cpu);
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int sched_cpu_wait_empty(unsigned int __always_unused cpu)
+{
+ return 0;
+}
+
+int sched_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /* Handle pending wakeups and then migrate everything off */
+ sched_tick_stop(cpu);
+
+ local_irq_save(flags);
+ double_rq_lock(rq, cpu_rq(0));
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ bind_zero(cpu);
+ double_rq_unlock(rq, cpu_rq(0));
+ sched_start_tick(rq, cpu);
+ hrexpiry_clear(rq);
+ local_irq_restore(flags);
+
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+/*
+ * Cheaper version of the below functions in case support for SMT and MC is
+ * compiled in but CPUs have no siblings.
+ */
+static bool sole_cpu_idle(struct rq *rq)
+{
+ return rq_idle(rq);
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static const cpumask_t *thread_cpumask(int cpu)
+{
+ return topology_sibling_cpumask(cpu);
+}
+/* All this CPU's SMT siblings are idle */
+static bool siblings_cpu_idle(struct rq *rq)
+{
+ return cpumask_subset(&rq->thread_mask, &cpu_idle_map);
+}
+#endif
+#ifdef CONFIG_SCHED_MC
+static const cpumask_t *core_cpumask(int cpu)
+{
+ return topology_core_cpumask(cpu);
+}
+/* All this CPU's shared cache siblings are idle */
+static bool cache_cpu_idle(struct rq *rq)
+{
+ return cpumask_subset(&rq->core_mask, &cpu_idle_map);
+}
+/* MC siblings CPU mask which share the same LLC */
+static const cpumask_t *llc_core_cpumask(int cpu)
+{
+#ifdef CONFIG_X86
+ return per_cpu(cpu_llc_shared_map, cpu);
+#else
+ return topology_core_cpumask(cpu);
+#endif
+}
+#endif
+
+enum sched_domain_level {
+ SD_LV_NONE = 0,
+ SD_LV_SIBLING,
+ SD_LV_MC,
+ SD_LV_BOOK,
+ SD_LV_CPU,
+ SD_LV_NODE,
+ SD_LV_ALLNODES,
+ SD_LV_MAX
+};
+
+/*
+ * Set up the relative cache distance of each online cpu from each
+ * other in a simple array for quick lookup. Locality is determined
+ * by the closest sched_domain that CPUs are separated by. CPUs with
+ * shared cache in SMT and MC are treated as local. Separate CPUs
+ * (within the same package or physically) within the same node are
+ * treated as not local. CPUs not even in the same domain (different
+ * nodes) are treated as very distant.
+ */
+static void __init select_leaders(void)
+{
+ struct rq *rq, *other_rq, *leader;
+ struct sched_domain *sd;
+ int cpu, other_cpu;
+#ifdef CONFIG_SCHED_SMT
+ bool smt_threads = false;
+#endif
+
+ for (cpu = 0; cpu < num_online_cpus(); cpu++) {
+ rq = cpu_rq(cpu);
+ leader = NULL;
+ /* First check if this cpu is in the same node */
+ for_each_domain(cpu, sd) {
+ if (sd->level > SD_LV_MC)
+ continue;
+ if (rqshare != RQSHARE_ALL)
+ leader = NULL;
+ /* Set locality to local node if not already found lower */
+ for_each_cpu(other_cpu, sched_domain_span(sd)) {
+ if (rqshare >= RQSHARE_SMP) {
+ other_rq = cpu_rq(other_cpu);
+
+ /* Set the smp_leader to the first CPU */
+ if (!leader)
+ leader = rq;
+ if (!other_rq->smp_leader)
+ other_rq->smp_leader = leader;
+ }
+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP)
+ rq->cpu_locality[other_cpu] = LOCALITY_SMP;
+ }
+ }
+
+ /*
+		 * Each runqueue has its own idle-check function in case it has
+		 * no siblings of its own, allowing mixed topologies.
+ */
+#ifdef CONFIG_SCHED_MC
+ leader = NULL;
+ if (cpumask_weight(core_cpumask(cpu)) > 1) {
+ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu));
+ cpumask_clear_cpu(cpu, &rq->core_mask);
+ for_each_cpu(other_cpu, core_cpumask(cpu)) {
+ if (rqshare == RQSHARE_MC ||
+ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) {
+ other_rq = cpu_rq(other_cpu);
+
+ /* Set the mc_leader to the first CPU */
+ if (!leader)
+ leader = rq;
+ if (!other_rq->mc_leader)
+ other_rq->mc_leader = leader;
+ }
+ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) {
+ /* this is to get LLC into play even in case LLC sharing is not used */
+ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))
+ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC;
+ else
+ rq->cpu_locality[other_cpu] = LOCALITY_MC;
+ }
+ }
+ rq->cache_idle = cache_cpu_idle;
+ }
+#endif
+#ifdef CONFIG_SCHED_SMT
+ leader = NULL;
+ if (cpumask_weight(thread_cpumask(cpu)) > 1) {
+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu));
+ cpumask_clear_cpu(cpu, &rq->thread_mask);
+ for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+ if (rqshare == RQSHARE_SMT) {
+ other_rq = cpu_rq(other_cpu);
+
+ /* Set the smt_leader to the first CPU */
+ if (!leader)
+ leader = rq;
+ if (!other_rq->smt_leader)
+ other_rq->smt_leader = leader;
+ }
+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT)
+ rq->cpu_locality[other_cpu] = LOCALITY_SMT;
+ }
+ rq->siblings_idle = siblings_cpu_idle;
+ smt_threads = true;
+ }
+#endif
+ }
+
+#ifdef CONFIG_SMT_NICE
+ if (smt_threads) {
+ check_siblings = &check_smt_siblings;
+ wake_siblings = &wake_smt_siblings;
+ smt_schedule = &smt_should_schedule;
+ }
+#endif
+
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ for_each_online_cpu(other_cpu) {
+ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
+ }
+ }
+}
+
+/* FIXME freeing locked spinlock */
+static void __init share_and_free_rq(struct rq *leader, struct rq *rq)
+{
+ WARN_ON(rq->nr_running > 0);
+
+ kfree(rq->node);
+ kfree(rq->sl);
+ kfree(rq->lock);
+ rq->node = leader->node;
+ rq->sl = leader->sl;
+ rq->lock = leader->lock;
+ rq->is_leader = false;
+ barrier();
+	/* To make up for not unlocking the freed runqueue lock */
+ preempt_enable();
+}
+
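+/*
+ * Walk the online CPUs and merge each runqueue into its chosen SMP, MC and
+ * SMT leader in turn, so that sibling CPUs end up sharing the leader's lock
+ * and skip list.
+ */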
+static void __init share_rqs(void)
+{
+ struct rq *rq, *leader;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ leader = rq->smp_leader;
+
+ rq_lock(rq);
+ if (leader && rq != leader) {
+ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n",
+ leader->cpu, rq->cpu);
+ share_and_free_rq(leader, rq);
+ } else
+ rq_unlock(rq);
+ }
+
+#ifdef CONFIG_SCHED_MC
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ leader = rq->mc_leader;
+
+ rq_lock(rq);
+ if (leader && rq != leader) {
+ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n",
+ leader->cpu, rq->cpu);
+ share_and_free_rq(leader, rq);
+ } else
+ rq_unlock(rq);
+ }
+#endif /* CONFIG_SCHED_MC */
+
+#ifdef CONFIG_SCHED_SMT
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ leader = rq->smt_leader;
+
+ rq_lock(rq);
+ if (leader && rq != leader) {
+ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n",
+ leader->cpu, rq->cpu);
+ share_and_free_rq(leader, rq);
+ } else
+ rq_unlock(rq);
+ }
+#endif /* CONFIG_SCHED_SMT */
+}
+
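+/*
+ * Count the leader runqueues and build the per-CPU lookup orders:
+ * rq->rq_order holds only leader runqueues while rq->cpu_order holds every
+ * online CPU, both sorted from the closest locality to the most distant.
+ */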
+static void __init setup_rq_orders(void)
+{
+ int *selected_cpus, *ordered_cpus;
+ struct rq *rq, *other_rq;
+ int cpu, other_cpu, i;
+
+ selected_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC);
+ ordered_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC);
+
+ total_runqueues = 0;
+ for_each_online_cpu(cpu) {
+ int locality, total_rqs = 0, total_cpus = 0;
+
+ rq = cpu_rq(cpu);
+ if (rq->is_leader)
+ total_runqueues++;
+
+ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) {
+ int selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu;
+ int ordered_cpus_idx;
+
+ ordered_cpus_idx = -1;
+ selected_cpu_cnt = 0;
+
+ for_each_online_cpu(test_cpu) {
+ if (cpu < num_online_cpus() / 2)
+ other_cpu = cpu + test_cpu;
+ else
+ other_cpu = cpu - test_cpu;
+ if (other_cpu < 0)
+ other_cpu += num_online_cpus();
+ else
+ other_cpu %= num_online_cpus();
+ /* gather CPUs of the same locality */
+ if (rq->cpu_locality[other_cpu] == locality) {
+ selected_cpus[selected_cpu_cnt] = other_cpu;
+ selected_cpu_cnt++;
+ }
+ }
+
+ /* reserve first CPU as starting point */
+ if (selected_cpu_cnt > 0) {
+ ordered_cpus_idx++;
+ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx];
+ selected_cpus[ordered_cpus_idx] = -1;
+ }
+
+ /* take each CPU and sort it within the same locality based on each inter-CPU localities */
+ for (test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) {
+ /* starting point with worst locality and current CPU */
+ best_locality = LOCALITY_DISTANT;
+ selected_cpu_idx = test_cpu_idx;
+
+ /* try to find the best locality within group */
+ for (cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) {
+ /* if CPU has not been used and locality is better */
+ if (selected_cpus[cpu_idx] > -1) {
+ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]);
+ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) {
+ /* assign best locality and best CPU idx in array */
+ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]];
+ selected_cpu_idx = cpu_idx;
+ }
+ }
+ }
+
+ /* add our next best CPU to ordered list */
+ ordered_cpus_idx++;
+ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx];
+ /* mark this CPU as used */
+ selected_cpus[selected_cpu_idx] = -1;
+ }
+
+ /* set up RQ and CPU orders */
+ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) {
+ other_rq = cpu_rq(ordered_cpus[test_cpu]);
+ /* set up cpu orders */
+ rq->cpu_order[total_cpus++] = other_rq;
+ if (other_rq->is_leader) {
+ /* set up RQ orders */
+ rq->rq_order[total_rqs++] = other_rq;
+ }
+ }
+ }
+ }
+
+ kfree(selected_cpus);
+ kfree(ordered_cpus);
+
+#ifdef CONFIG_X86
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ for (i = 0; i < total_runqueues; i++) {
+ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i,
+ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu));
+ }
+ }
+
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ for (i = 0; i < num_online_cpus(); i++) {
+ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i,
+ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu));
+ }
+ }
+#endif
+}
+
+void __init sched_init_smp(void)
+{
+ sched_init_numa();
+
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * cpu masks are stable and all blatant races in the below code cannot
+ * happen.
+ */
+ mutex_lock(&sched_domains_mutex);
+ sched_init_domains(cpu_active_mask);
+ mutex_unlock(&sched_domains_mutex);
+
+ /* Move init over to a non-isolated CPU */
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
+ BUG();
+
+ local_irq_disable();
+ mutex_lock(&sched_domains_mutex);
+ lock_all_rqs();
+
+ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n",
+ num_possible_cpus(), num_present_cpus(), num_online_cpus());
+
+ select_leaders();
+
+ unlock_all_rqs();
+ mutex_unlock(&sched_domains_mutex);
+
+ share_rqs();
+
+ local_irq_enable();
+
+ setup_rq_orders();
+
+ switch (rqshare) {
+ case RQSHARE_ALL:
+ /* This should only ever read 1 */
+ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n",
+ total_runqueues);
+ break;
+ case RQSHARE_SMP:
+ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n",
+ total_runqueues);
+ break;
+ case RQSHARE_MC:
+ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n",
+ total_runqueues);
+ break;
+ case RQSHARE_MC_LLC:
+ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n",
+ total_runqueues);
+ break;
+ case RQSHARE_SMT:
+ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n",
+ total_runqueues);
+ break;
+ case RQSHARE_NONE:
+ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n",
+ total_runqueues);
+ break;
+ }
+
+ sched_smp_initialized = true;
+}
+#else
+void __init sched_init_smp(void)
+{
+ sched_smp_initialized = true;
+}
+#endif /* CONFIG_SMP */
+
+int in_sched_functions(unsigned long addr)
+{
+ return in_lock_functions(addr) ||
+ (addr >= (unsigned long)__sched_text_start
+ && addr < (unsigned long)__sched_text_end);
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+/* task group related information */
+struct task_group {
+ struct cgroup_subsys_state css;
+
+ struct rcu_head rcu;
+ struct list_head list;
+
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
+};
+
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
+struct task_group root_task_group;
+LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
+#endif /* CONFIG_CGROUP_SCHED */
+
+void __init sched_init(void)
+{
+#ifdef CONFIG_SMP
+ int cpu_ids;
+#endif
+ int i;
+ struct rq *rq;
+
+ wait_bit_init();
+
+ prio_ratios[0] = 128;
+ for (i = 1 ; i < NICE_WIDTH ; i++)
+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
+
+ skiplist_node_init(&init_task.node);
+
+#ifdef CONFIG_SMP
+ init_defrootdomain();
+ cpumask_clear(&cpu_idle_map);
+#else
+ uprq = &per_cpu(runqueues, 0);
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
+ list_add(&root_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&root_task_group.children);
+ INIT_LIST_HEAD(&root_task_group.siblings);
+#endif /* CONFIG_CGROUP_SCHED */
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC);
+ skiplist_init(rq->node);
+ rq->sl = new_skiplist(rq->node);
+ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC);
+ raw_spin_lock_init(rq->lock);
+ rq->nr_running = 0;
+ rq->nr_uninterruptible = 0;
+ rq->nr_switches = 0;
+ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0;
+ rq->last_jiffy = jiffies;
+ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns =
+ rq->iowait_ns = rq->idle_ns = 0;
+ rq->dither = 0;
+ set_rq_task(rq, &init_task);
+ rq->iso_ticks = 0;
+ rq->iso_refractory = false;
+#ifdef CONFIG_SMP
+ rq->is_leader = true;
+ rq->smp_leader = NULL;
+#ifdef CONFIG_SCHED_MC
+ rq->mc_leader = NULL;
+#endif
+#ifdef CONFIG_SCHED_SMT
+ rq->smt_leader = NULL;
+#endif
+ rq->sd = NULL;
+ rq->rd = NULL;
+ rq->online = false;
+ rq->cpu = i;
+ rq_attach_root(rq, &def_root_domain);
+#endif /* CONFIG_SMP */
+ init_rq_hrexpiry(rq);
+ atomic_set(&rq->nr_iowait, 0);
+ }
+
+#ifdef CONFIG_SMP
+ cpu_ids = i;
+ /*
+ * Set the base locality for cpu cache distance calculation to
+ * "distant" (3). Make sure the distance from a CPU to itself is 0.
+ */
+ for_each_possible_cpu(i) {
+ int j;
+
+ rq = cpu_rq(i);
+#ifdef CONFIG_SCHED_SMT
+ rq->siblings_idle = sole_cpu_idle;
+#endif
+#ifdef CONFIG_SCHED_MC
+ rq->cache_idle = sole_cpu_idle;
+#endif
+ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC);
+ for_each_possible_cpu(j) {
+ if (i == j)
+ rq->cpu_locality[j] = LOCALITY_SAME;
+ else
+ rq->cpu_locality[j] = LOCALITY_DISTANT;
+ }
+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC);
+ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC);
+ rq->rq_order[0] = rq->cpu_order[0] = rq;
+ for (j = 1; j < cpu_ids; j++)
+ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j);
+ }
+#endif
+
+ /*
+ * The boot idle thread does lazy MMU switching as well:
+ */
+ mmgrab(&init_mm);
+ enter_lazy_tlb(&init_mm, current);
+
+ /*
+ * Make us the idle thread. Technically, schedule() should not be
+ * called from this thread, however somewhere below it might be,
+ * but because we are the idle thread, we just pick up running again
+ * when this runqueue becomes "idle".
+ */
+ init_idle(current, smp_processor_id());
+
+#ifdef CONFIG_SMP
+ idle_thread_set_boot_cpu();
+#endif /* SMP */
+
+ init_schedstats();
+
+ psi_init();
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = preempt_count() + rcu_preempt_depth();
+
+ return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state,
+ * since we will exit with TASK_RUNNING make sure we enter with it,
+ * otherwise we will destroy state.
+ */
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void __cant_sleep(const char *file, int line, int preempt_offset)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > preempt_offset)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
+ /* Ratelimiting timestamp: */
+ static unsigned long prev_jiffy;
+
+ unsigned long preempt_disable_ip;
+
+ /* WARN_ON_ONCE() by default, no rate limit required: */
+ rcu_sleep_check();
+
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current) && !current->non_block_count) ||
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ /* Save this before calling printk(), since that will clobber it: */
+ preempt_disable_ip = get_preempt_disable_ip(current);
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
+ current->pid, current->comm);
+
+ if (task_stack_end_corrupted(current))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && !preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
+ }
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL(___might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
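+/* Reset all user space RT and ISO tasks back to SCHED_NORMAL via magic SysRq. */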
+static inline void normalise_rt_tasks(void)
+{
+ struct sched_attr attr = {};
+ struct task_struct *g, *p;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ /*
+ * Only normalize user tasks:
+ */
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ if (!rt_task(p) && !iso_task(p))
+ continue;
+
+ rq = task_rq_lock(p, &rf);
+ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false);
+ task_rq_unlock(rq, p, &rf);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+void normalize_rt_tasks(void)
+{
+ normalise_rt_tasks();
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given CPU.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
+ */
+struct task_struct *curr_task(int cpu)
+{
+ return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * ia64_set_curr_task - set the current task for a given CPU.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a CPU in a non-blocking manner. This function
+ * must be called with all CPUs synchronised, and interrupts disabled; the
+ * caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void ia64_set_curr_task(int cpu, struct task_struct *p)
+{
+ cpu_curr(cpu) = p;
+}
+
+#endif
+
+void init_idle_bootup_task(struct task_struct *idle)
+{}
+
+#ifdef CONFIG_SCHED_DEBUG
+__read_mostly bool sched_debug_enabled;
+
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+ struct seq_file *m)
+{
+ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
+ get_nr_threads(p));
+}
+
+void proc_sched_set_task(struct task_struct *p)
+{}
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+static void sched_free_group(struct task_group *tg)
+{
+ kmem_cache_free(task_group_cache, tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(struct task_group *parent)
+{
+ struct task_group *tg;
+
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ return tg;
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void sched_free_group_rcu(struct rcu_head *rhp)
+{
+ /* Now it should be safe to free those cfs_rqs */
+ sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+void sched_destroy_group(struct task_group *tg)
+{
+ /* Wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
+}
+
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
+
+ if (!parent) {
+ /* This is early initialization for the top cgroup */
+ return &root_task_group.css;
+ }
+
+ tg = sched_create_group(parent);
+ if (IS_ERR(tg))
+ return ERR_PTR(-ENOMEM);
+ return &tg->css;
+}
+
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ sched_offline_group(tg);
+}
+
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
+}
+
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ return 0;
+}
+
+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+
+static struct cftype cpu_legacy_files[] = {
+ { } /* Terminate */
+};
+
+static struct cftype cpu_files[] = {
+ { } /* terminate */
+};
+
+static int cpu_extra_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+ return 0;
+}
+
+struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
+ .css_released = cpu_cgroup_css_released,
+ .css_free = cpu_cgroup_css_free,
+ .css_extra_stat_show = cpu_extra_stat_show,
+ .fork = cpu_cgroup_fork,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
+ .legacy_cftypes = cpu_legacy_files,
+ .dfl_cftypes = cpu_files,
+ .early_init = true,
+ .threaded = true,
+};
+#endif /* CONFIG_CGROUP_SCHED */
+
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
+{
+ trace_sched_update_nr_running_tp(rq, count);
+}
+
+/* CFS Compat */
+#ifdef CONFIG_RCU_TORTURE_TEST
+int sysctl_sched_rt_runtime;
+#endif
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
new file mode 100644
index 00000000000000..e8b42add3a7fe0
--- /dev/null
+++ b/kernel/sched/MuQSS.h
@@ -0,0 +1,1076 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MUQSS_SCHED_H
+#define MUQSS_SCHED_H
+
+#include <linux/sched/clock.h>
+#include <linux/sched/cpufreq.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/smt.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/wake_q.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include <linux/cgroup.h>
+#include <linux/cpufreq.h>
+#include <linux/cpuidle.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/freezer.h>
+#include <linux/kernel_stat.h>
+#include <linux/kthread.h>
+#include <linux/membarrier.h>
+#include <linux/livepatch.h>
+#include <linux/proc_fs.h>
+#include <linux/psi.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/skip_list.h>
+#include <linux/stop_machine.h>
+#include <linux/suspend.h>
+#include <linux/swait.h>
+#include <linux/syscalls.h>
+#include <linux/tick.h>
+#include <linux/tsacct_kern.h>
+#include <linux/u64_stats_sync.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
+#include "cpupri.h"
+
+#include <trace/events/sched.h>
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+#else
+# define SCHED_WARN_ON(x) ((void)(x))
+#endif
+
+/* Wake flags. The first three directly map to some SD flag value */
+#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
+
+#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
+
+#ifdef CONFIG_SMP
+static_assert(WF_EXEC == SD_BALANCE_EXEC);
+static_assert(WF_FORK == SD_BALANCE_FORK);
+static_assert(WF_TTWU == SD_BALANCE_WAKE);
+#endif
+
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED 1
+#define TASK_ON_RQ_MIGRATING 2
+
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
+
+struct rq;
+
+#ifdef CONFIG_SMP
+
+static inline bool sched_asym_prefer(int a, int b)
+{
+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
+}
+
+struct perf_domain {
+ struct em_perf_domain *em_pd;
+ struct perf_domain *next;
+ struct rcu_head rcu;
+};
+
+/* Scheduling group status flags */
+#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
+#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+ atomic_t refcount;
+ atomic_t rto_count;
+ struct rcu_head rcu;
+ cpumask_var_t span;
+ cpumask_var_t online;
+
+ /*
+ * Indicate pullable load on at least one CPU, e.g:
+ * - More than one runnable task
+ * - Running task is misfit
+ */
+ int overload;
+
+ /* Indicate one or more cpus over-utilized (tipping point) */
+ int overutilized;
+
+ /*
+ * The bit corresponding to a CPU gets set here if such CPU has more
+ * than one runnable -deadline task (as it is below for RT tasks).
+ */
+ cpumask_var_t dlo_mask;
+ atomic_t dlo_count;
+
+ /* Replace unused CFS structures with void */
+ //struct dl_bw dl_bw;
+ //struct cpudl cpudl;
+ void *dl_bw;
+ void *cpudl;
+ u64 visit_gen;
+
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+ */
+ cpumask_var_t rto_mask;
+ //struct cpupri cpupri;
+ void *cpupri;
+
+ unsigned long max_cpu_capacity;
+
+ /*
+ * NULL-terminated list of performance domains intersecting with the
+ * CPUs of the rd. Protected by RCU.
+ */
+ struct perf_domain *pd;
+};
+
+extern void init_defrootdomain(void);
+extern int sched_init_domains(const struct cpumask *cpu_map);
+extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+
+static inline void cpupri_cleanup(void __maybe_unused *cpupri)
+{
+}
+
+static inline void cpudl_cleanup(void __maybe_unused *cpudl)
+{
+}
+
+static inline void init_dl_bw(void __maybe_unused *dl_bw)
+{
+}
+
+static inline int cpudl_init(void __maybe_unused *dl_bw)
+{
+ return 0;
+}
+
+static inline int cpupri_init(void __maybe_unused *cpupri)
+{
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ * This data should only be modified by the local cpu.
+ */
+struct rq {
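+ /*
+ * With MuQSS runqueue sharing (rqshare=), lock can point at a lock
+ * shared with sibling runqueues so that sharing CPUs serialise on a
+ * single lock; orig_lock retains this runqueue's own lock.
+ */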
+ raw_spinlock_t *lock;
+ raw_spinlock_t *orig_lock;
+
+ struct task_struct __rcu *curr;
+ struct task_struct *idle;
+ struct task_struct *stop;
+ struct mm_struct *prev_mm;
+
+ unsigned int nr_running;
+ /*
+ * This is part of a global counter where only the total sum
+ * over all CPUs matters. A task can increase this counter on
+ * one CPU and if it got migrated afterwards it may decrease
+ * it on another CPU. Always updated under the runqueue lock:
+ */
+ unsigned long nr_uninterruptible;
+#ifdef CONFIG_SMP
+ unsigned int ttwu_pending;
+#endif
+ u64 nr_switches;
+
+ /* Stored data about rq->curr to work outside rq lock */
+ u64 rq_deadline;
+ int rq_prio;
+
+ /* Best queued id for use outside lock */
+ u64 best_key;
+
+ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */
+ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */
+ u64 niffies; /* Last time this RQ updated rq clock */
+ u64 last_niffy; /* Last niffies as updated by local clock */
+ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */
+
+ u64 load_update; /* When we last updated load */
+ unsigned long load_avg; /* Rolling load average */
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+ u64 irq_load_update; /* When we last updated IRQ load */
+ unsigned long irq_load_avg; /* Rolling IRQ load average */
+#endif
+#ifdef CONFIG_SMT_NICE
+ struct mm_struct *rq_mm;
+ int rq_smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+ /* Accurate timekeeping data */
+ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns,
+ iowait_ns, idle_ns;
+ atomic_t nr_iowait;
+
+#ifdef CONFIG_MEMBARRIER
+ int membarrier_state;
+#endif
+
+ skiplist_node *node;
+ skiplist *sl;
+#ifdef CONFIG_SMP
+ struct task_struct *preempt; /* Preempt triggered on this task */
+ struct task_struct *preempting; /* Hint only, what task is preempting */
+
+ int cpu; /* cpu of this runqueue */
+ bool online;
+
+ struct root_domain *rd;
+ struct sched_domain *sd;
+
+ unsigned long cpu_capacity_orig;
+
+ int *cpu_locality; /* CPU relative cache distance */
+ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */
+ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */
+
+ bool is_leader;
+ struct rq *smp_leader; /* First physical CPU per node */
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+ struct sched_avg avg_thermal;
+#endif /* CONFIG_SCHED_THERMAL_PRESSURE */
+#ifdef CONFIG_SCHED_SMT
+ struct rq *smt_leader; /* First logical CPU in SMT siblings */
+ cpumask_t thread_mask;
+ bool (*siblings_idle)(struct rq *rq);
+ /* See if all smt siblings are idle */
+#endif /* CONFIG_SCHED_SMT */
+#ifdef CONFIG_SCHED_MC
+ struct rq *mc_leader; /* First logical CPU in MC siblings */
+ cpumask_t core_mask;
+ bool (*cache_idle)(struct rq *rq);
+ /* See if all cache siblings are idle */
+#endif /* CONFIG_SCHED_MC */
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif /* CONFIG_PARAVIRT */
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
+
+ u64 clock, old_clock, last_tick;
+ /* Ensure that all clocks are in the same cache line */
+ u64 clock_task ____cacheline_aligned;
+ int dither;
+
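+ /* SCHED_ISO accounting: recent ISO tick tally, and whether ISO tasks are currently throttled */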
+ int iso_ticks;
+ bool iso_refractory;
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+ struct hrtimer hrexpiry_timer;
+#endif
+
+ int rt_nr_running; /* Number real time tasks running */
+#ifdef CONFIG_SCHEDSTATS
+
+ /* latency stats */
+ struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+ /* sys_sched_yield() stats */
+ unsigned int yld_count;
+
+ /* schedule() stats */
+ unsigned int sched_switch;
+ unsigned int sched_count;
+ unsigned int sched_goidle;
+
+ /* try_to_wake_up() stats */
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
+#endif /* CONFIG_SCHEDSTATS */
+
+#ifdef CONFIG_CPU_IDLE
+ /* Must be inspected within a rcu lock section */
+ struct cpuidle_state *idle_state;
+#endif
+};
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return READ_ONCE(rq->clock);
+}
+
+static inline u64 rq_clock(struct rq *rq)
+{
+ lockdep_assert_held(rq->lock);
+
+ return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+ lockdep_assert_held(rq->lock);
+
+ return rq->clock_task;
+}
+
+/**
+ * By default the decay is the default pelt decay period.
+ * The decay shift can change the decay period in
+ * multiples of 32.
+ * Decay shift Decay period(ms)
+ * 0 32
+ * 1 64
+ * 2 128
+ * 3 256
+ * 4 512
+ */
+extern int sched_thermal_decay_shift;
+
+static inline u64 rq_clock_thermal(struct rq *rq)
+{
+ return rq_clock_task(rq) >> sched_thermal_decay_shift;
+}
+
+struct rq_flags {
+ unsigned long flags;
+};
+
+#ifdef CONFIG_SMP
+struct rq *cpu_rq(int cpu);
+#endif
+
+#ifndef CONFIG_SMP
+extern struct rq *uprq;
+#define cpu_rq(cpu) (uprq)
+#define this_rq() (uprq)
+#define raw_rq() (uprq)
+#define task_rq(p) (uprq)
+#define cpu_curr(cpu) ((uprq)->curr)
+#else /* CONFIG_SMP */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#define this_rq() this_cpu_ptr(&runqueues)
+#define raw_rq() raw_cpu_ptr(&runqueues)
+#define task_rq(p) cpu_rq(task_cpu(p))
+#endif /* CONFIG_SMP */
+
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+ return rq->curr == p;
+}
+
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
+ return task_current(rq, p);
+#endif
+}
+
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
+static inline int task_on_rq_migrating(struct task_struct *p)
+{
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
+}
+
+static inline void rq_lock(struct rq *rq)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(rq->lock);
+}
+
+static inline void rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(rq->lock);
+}
+
+static inline void rq_lock_irq(struct rq *rq)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irq(rq->lock);
+}
+
+static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf)
+ __releases(rq->lock)
+{
+ raw_spin_unlock_irq(rq->lock);
+}
+
+static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irqsave(rq->lock, rf->flags);
+}
+
+static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ raw_spin_unlock_irqrestore(rq->lock, rf->flags);
+}
+
+static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
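+ /*
+ * The task can migrate between the task_rq() lookup and taking the
+ * rq lock; retry until the rq we locked is still the task's rq.
+ */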
+ while (42) {
+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
+ rq = task_rq(p);
+ raw_spin_lock(rq->lock);
+ if (likely(rq == task_rq(p)))
+ break;
+ raw_spin_unlock(rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+ }
+ return rq;
+}
+
+static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+}
+
+static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ while (42) {
+ rq = task_rq(p);
+ raw_spin_lock(rq->lock);
+ if (likely(rq == task_rq(p)))
+ break;
+ raw_spin_unlock(rq->lock);
+ }
+ return rq;
+}
+
+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf)
+{
+ rq_unlock(rq);
+}
+
+static inline struct rq *
+this_rq_lock_irq(struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ rq_lock(rq);
+ return rq;
+}
+
+/*
+ * {de,en}queue flags: most are not used on MuQSS.
+ *
+ * DEQUEUE_SLEEP - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_MIGRATED - the task was migrated during wakeup
+ *
+ */
+
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
+
+#define ENQUEUE_WAKEUP 0x01
+#define ENQUEUE_RESTORE 0x02
+
+#ifdef CONFIG_SMP
+#define ENQUEUE_MIGRATED 0x40
+#else
+#define ENQUEUE_MIGRATED 0x00
+#endif
+
+#ifdef CONFIG_NUMA
+enum numa_topology_type {
+ NUMA_DIRECT,
+ NUMA_GLUELESS_MESH,
+ NUMA_BACKPLANE,
+};
+extern enum numa_topology_type sched_numa_topology_type;
+extern int sched_max_numa_distance;
+extern bool find_numa_distance(int distance);
+extern void sched_init_numa(void);
+extern void sched_domains_numa_masks_set(unsigned int cpu);
+extern void sched_domains_numa_masks_clear(unsigned int cpu);
+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
+#else
+static inline void sched_init_numa(void) { }
+static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
+static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ return nr_cpu_ids;
+}
+#endif
+
+extern struct mutex sched_domains_mutex;
+extern struct static_key_false sched_schedstats;
+
+#define rcu_dereference_check_sched_domain(p) \
+ rcu_dereference_check((p), \
+ lockdep_is_held(&sched_domains_mutex))
+
+#define SCA_CHECK 0x01
+
+#ifdef CONFIG_SMP
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See destroy_sched_domains: call_rcu for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ __sd; __sd = __sd->parent)
+
+/**
+ * highest_flag_domain - Return highest sched_domain containing flag.
+ * @cpu: The cpu whose highest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the highest sched_domain
+ * for the given cpu.
+ *
+ * Returns the highest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd, *hsd = NULL;
+
+ for_each_domain(cpu, sd) {
+ if (!(sd->flags & flag))
+ break;
+ hsd = sd;
+ }
+
+ return hsd;
+}
+
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ if (sd->flags & flag)
+ break;
+ }
+
+ return sd;
+}
+
+DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_size);
+DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+
+struct sched_group_capacity {
+ atomic_t ref;
+ /*
+ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
+ * for a single CPU.
+ */
+ unsigned long capacity;
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
+ unsigned long max_capacity; /* Max per-CPU capacity in group */
+ unsigned long next_update;
+ int imbalance; /* XXX unrelated to capacity but shared group state */
+
+#ifdef CONFIG_SCHED_DEBUG
+ int id;
+#endif
+
+ unsigned long cpumask[]; /* balance mask */
+};
+
+struct sched_group {
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
+
+ unsigned int group_weight;
+ struct sched_group_capacity *sgc;
+ int asym_prefer_cpu; /* cpu of highest priority in group */
+
+ /*
+ * The CPUs this group covers.
+ *
+ * NOTE: this field is variable length. (Allocated dynamically
+ * by attaching extra space to the end of the structure,
+ * depending on how many CPUs the kernel has booted up with)
+ */
+ unsigned long cpumask[];
+};
+
+static inline struct cpumask *sched_group_span(struct sched_group *sg)
+{
+ return to_cpumask(sg->cpumask);
+}
+
+/*
+ * See build_balance_mask().
+ */
+static inline struct cpumask *group_balance_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgc->cpumask);
+}
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+ return cpumask_first(sched_group_span(group));
+}
+
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void dirty_sched_domain_sysctl(int cpu);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void dirty_sched_domain_sysctl(int cpu)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
+extern void flush_smp_call_function_from_idle(void);
+
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+extern void set_rq_online (struct rq *rq);
+extern void set_rq_offline(struct rq *rq);
+extern bool sched_smp_initialized;
+
+static inline void update_group_capacity(struct sched_domain *sd, int cpu)
+{
+}
+
+static inline void trigger_load_balance(struct rq *rq)
+{
+}
+
+#else /* CONFIG_SMP */
+
+static inline void flush_smp_call_function_from_idle(void) { }
+
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_CPU_IDLE
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+ rq->idle_state = idle_state;
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ SCHED_WARN_ON(!rcu_read_lock_held());
+ return rq->idle_state;
+}
+#else
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ return NULL;
+}
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+extern bool sched_debug_enabled;
+#endif
+
+extern void schedule_idle(void);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+struct irqtime {
+ u64 total;
+ u64 tick_delta;
+ u64 irq_start_time;
+ struct u64_stats_sync sync;
+};
+
+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's own runtime would be subtracted from its
+ * sum_exec_runtime and it would never move forward.
+ */
+static inline u64 irq_time_read(int cpu)
+{
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ unsigned int seq;
+ u64 total;
+
+ do {
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ total = irqtime->total;
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
+
+ return total;
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline bool sched_stop_runnable(struct rq *rq)
+{
+ return rq->stop && task_on_rq_queued(rq->stop);
+}
+
+#ifdef CONFIG_SMP
+static inline int cpu_of(struct rq *rq)
+{
+ return rq->cpu;
+}
+#else /* CONFIG_SMP */
+static inline int cpu_of(struct rq *rq)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+static inline void cpufreq_trigger(struct rq *rq, unsigned int flags)
+{
+ struct update_util_data *data;
+
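+ /* MuQSS stamps the callback with its niffies clock rather than rq_clock() */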
+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+ cpu_of(rq)));
+
+ if (data)
+ data->func(data, rq->niffies, flags);
+}
+#else
+static inline void cpufreq_trigger(struct rq *rq, unsigned int flag)
+{
+}
+#endif /* CONFIG_CPU_FREQ */
+
+static __always_inline
+unsigned int uclamp_rq_util_with(struct rq __maybe_unused *rq, unsigned int util,
+ struct task_struct __maybe_unused *p)
+{
+ return util;
+}
+
+static inline bool uclamp_is_used(void)
+{
+ return false;
+}
+
+#ifndef arch_scale_freq_tick
+static __always_inline
+void arch_scale_freq_tick(void)
+{
+}
+#endif
+
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant() (true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant() (false)
+#endif
+
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ return tsk_seruntime(t);
+}
+#else
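+/*
+ * A u64 load is not a single atomic access on 32-bit, so sample the
+ * runtime under the task's rq lock for a consistent value.
+ */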
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ struct rq_flags rf;
+ u64 ns;
+ struct rq *rq;
+
+ rq = task_rq_lock(t, &rf);
+ ns = tsk_seruntime(t);
+ task_rq_unlock(rq, t, &rf);
+
+ return ns;
+}
+#endif
+
+#ifndef arch_scale_freq_capacity
+/**
+ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
+ *
+ * f_curr
+ * ------ * SCHED_CAPACITY_SCALE
+ * f_max
+ */
+static __always_inline
+unsigned long arch_scale_freq_capacity(int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+#ifdef CONFIG_NO_HZ_FULL
+extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);
+
+/*
+ * Tick may be needed by tasks in the runqueue depending on their policy and
+ * requirements. If the tick is needed, let's send the target an IPI to kick it out of
+ * nohz mode if necessary.
+ */
+static inline void sched_update_tick_dependency(struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (sched_can_stop_tick(rq))
+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+ else
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#else
+static inline int sched_tick_offload_init(void) { return 0; }
+static inline void sched_update_tick_dependency(struct rq *rq) { }
+#endif
+
+#define SCHED_FLAG_SUGOV 0x10000000
+
+#ifdef CONFIG_SMP
+/**
+ * enum cpu_util_type - CPU utilization type
+ * @FREQUENCY_UTIL: Utilization used to select frequency
+ * @ENERGY_UTIL: Utilization used during energy calculation
+ *
+ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
+ * need to be aggregated differently depending on the usage made of them. This
+ * enum is used within effective_cpu_util() to differentiate the types of
+ * utilization expected by the callers, and adjust the aggregation accordingly.
+ */
+enum cpu_util_type {
+ FREQUENCY_UTIL,
+ ENERGY_UTIL,
+};
+
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum cpu_util_type type,
+ struct task_struct *p);
+
+static inline unsigned long cpu_bw_dl(struct rq *rq)
+{
+ return 0;
+}
+
+static inline unsigned long cpu_util_dl(struct rq *rq)
+{
+ return 0;
+}
+
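+/*
+ * MuQSS keeps no PELT averages; approximate CFS utilization with the
+ * rolling load average, and RT utilization with the runnable RT task
+ * count, both clamped to the capacity scale.
+ */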
+static inline unsigned long cpu_util_cfs(struct rq *rq)
+{
+ unsigned long ret = READ_ONCE(rq->load_avg);
+
+ if (ret > SCHED_CAPACITY_SCALE)
+ ret = SCHED_CAPACITY_SCALE;
+ return ret;
+}
+
+static inline unsigned long cpu_util_rt(struct rq *rq)
+{
+ unsigned long ret = READ_ONCE(rq->rt_nr_running);
+
+ if (ret > SCHED_CAPACITY_SCALE)
+ ret = SCHED_CAPACITY_SCALE;
+ return ret;
+}
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+static inline unsigned long cpu_util_irq(struct rq *rq)
+{
+ unsigned long ret = READ_ONCE(rq->irq_load_avg);
+
+ if (ret > SCHED_CAPACITY_SCALE)
+ ret = SCHED_CAPACITY_SCALE;
+ return ret;
+}
+
+static inline
+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
+{
+ util *= (max - irq);
+ util /= max;
+
+ return util;
+}
+#else /* CONFIG_HAVE_SCHED_AVG_IRQ */
+static inline unsigned long cpu_util_irq(struct rq *rq)
+{
+ return 0;
+}
+
+static inline
+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
+{
+ return util;
+}
+#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */
+#endif /* CONFIG_SMP */
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
+
+DECLARE_STATIC_KEY_FALSE(sched_energy_present);
+
+static inline bool sched_energy_enabled(void)
+{
+ return static_branch_unlikely(&sched_energy_present);
+}
+
+#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+
+#define perf_domain_span(pd) NULL
+static inline bool sched_energy_enabled(void) { return false; }
+
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#ifdef CONFIG_MEMBARRIER
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_switch_mm(struct rq *rq,
+ struct mm_struct *prev_mm,
+ struct mm_struct *next_mm)
+{
+ int membarrier_state;
+
+ if (prev_mm == next_mm)
+ return;
+
+ membarrier_state = atomic_read(&next_mm->membarrier_state);
+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+ return;
+
+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+#else
+static inline void membarrier_switch_mm(struct rq *rq,
+ struct mm_struct *prev_mm,
+ struct mm_struct *next_mm)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+ if (!(p->flags & PF_KTHREAD))
+ return false;
+
+ if (p->nr_cpus_allowed != 1)
+ return false;
+
+ return true;
+}
+#endif
+
+void swake_up_all_locked(struct swait_queue_head *q);
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+/* pelt.h compat CONFIG_SCHED_THERMAL_PRESSURE impossible with MUQSS */
+static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return 0;
+}
+
+#ifdef CONFIG_RCU_TORTURE_TEST
+extern int sysctl_sched_rt_runtime;
+#endif
+
+#endif /* MUQSS_SCHED_H */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4f09afd2f32110..19d6b1a7ec6433 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -568,7 +568,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
struct task_struct *thread;
struct sched_attr attr = {
.size = sizeof(struct sched_attr),
+#ifdef CONFIG_SCHED_MUQSS
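+ /* MuQSS has no deadline class; run the sugov kthread as RT round-robin instead */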
+ .sched_policy = SCHED_RR,
+#else
.sched_policy = SCHED_DEADLINE,
+#endif
.sched_flags = SCHED_FLAG_SUGOV,
.sched_nice = 0,
.sched_priority = 0,
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index d6cba0020064cc..935c7dc48e266e 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,6 +17,7 @@ struct cpupri {
int *cpu_to_pri;
};
+#ifndef CONFIG_SCHED_MUQSS
#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask);
@@ -27,3 +28,4 @@ void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
#endif
+#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 872e481d5098c8..f8a996d5932c28 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -267,26 +267,6 @@ static inline u64 account_other_time(u64 max)
return accounted;
}
-#ifdef CONFIG_64BIT
-static inline u64 read_sum_exec_runtime(struct task_struct *t)
-{
- return t->se.sum_exec_runtime;
-}
-#else
-static u64 read_sum_exec_runtime(struct task_struct *t)
-{
- u64 ns;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(t, &rf);
- ns = t->se.sum_exec_runtime;
- task_rq_unlock(rq, t, &rf);
-
- return ns;
-}
-#endif
-
/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group.
@@ -612,7 +592,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime = {
- .sum_exec_runtime = p->se.sum_exec_runtime,
+ .sum_exec_runtime = tsk_seruntime(p),
};
task_cputime(p, &cputime.utime, &cputime.stime);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7ca3d3d86c2a5c..2ed8f5ae4d8b2a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -403,6 +403,7 @@ void cpu_startup_entry(enum cpuhp_state state)
do_idle();
}
+#ifndef CONFIG_SCHED_MUQSS
/*
* idle-task scheduling class.
*/
@@ -516,3 +517,4 @@ DEFINE_SCHED_CLASS(idle) = {
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
};
+#endif /* CONFIG_SCHED_MUQSS */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a189bec1372917..48820637193e4f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,19 @@
/*
* Scheduler internal types and methods:
*/
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
+
+/* Begin compatibility wrappers for MuQSS/CFS differences */
+#define rq_rt_nr_running(rq) ((rq)->rt_nr_running)
+#define rq_h_nr_running(rq) ((rq)->nr_running)
+
+#else /* CONFIG_SCHED_MUQSS */
+
+#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running)
+#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running)
+
+
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -990,6 +1003,7 @@ struct rq {
struct callback_head *balance_callback;
+
unsigned char nohz_idle_balance;
unsigned char idle_balance;
@@ -1561,6 +1575,7 @@ extern int group_balance_cpu(struct sched_group *sg);
#ifdef CONFIG_SCHED_DEBUG
void update_sched_domain_debugfs(void);
void dirty_sched_domain_sysctl(int cpu);
+
#else
static inline void update_sched_domain_debugfs(void)
{
@@ -1568,6 +1583,7 @@ static inline void update_sched_domain_debugfs(void)
static inline void dirty_sched_domain_sysctl(int cpu)
{
}
+
#endif
extern int sched_update_scaling(void);
@@ -2749,3 +2765,24 @@ extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
+/* MuQSS compatibility functions */
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ return t->se.sum_exec_runtime;
+}
+#else
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ u64 ns;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(t, &rf);
+ ns = t->se.sum_exec_runtime;
+ task_rq_unlock(rq, t, &rf);
+
+ return ns;
+}
+#endif
+#endif /* CONFIG_SCHED_MUQSS */
\ No newline at end of file
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 55a0a243e87183..210727531bd85c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -467,7 +467,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
struct root_domain *old_rd = NULL;
unsigned long flags;
+#ifdef CONFIG_SCHED_MUQSS
+ raw_spin_lock_irqsave(rq->lock, flags);
+#else
raw_spin_lock_irqsave(&rq->lock, flags);
+#endif
if (rq->rd) {
old_rd = rq->rd;
@@ -493,7 +497,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
+#ifdef CONFIG_SCHED_MUQSS
+ raw_spin_unlock_irqrestore(rq->lock, flags);
+#else
raw_spin_unlock_irqrestore(&rq->lock, flags);
+#endif
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
diff --git a/kernel/skip_list.c b/kernel/skip_list.c
new file mode 100644
index 00000000000000..bf5c6e97e1396a
--- /dev/null
+++ b/kernel/skip_list.c
@@ -0,0 +1,148 @@
+/*
+ Copyright (C) 2011,2016 Con Kolivas.
+
+ Code based on example originally by William Pugh.
+
+Skip Lists are a probabilistic alternative to balanced trees, as
+described in the June 1990 issue of CACM and were invented by
+William Pugh in 1987.
+
+A couple of comments about this implementation:
+The routine randomLevel has been hard-coded to generate random
+levels using p=0.25. It can be easily changed.
+
+The insertion routine has been implemented so as to use the
+dirty hack described in the CACM paper: if a random level is
+generated that is more than the current maximum level, the
+current maximum level plus one is used instead.
+
+Levels start at zero and go up to MaxLevel (which is equal to
+MaxNumberOfLevels-1).
+
+The routines defined in this file are:
+
+init: defines slnode
+
+new_skiplist: returns a new, empty list
+
+randomLevel: Returns a random level based on a u64 random seed passed to it.
+In MuQSS, the "niffy" time is used for this purpose.
+
+insert(l, key, value): inserts the binding (key, value) into l. This operation
+occurs in O(log n) time.
+
+delnode(slnode, l, node): deletes any binding of key from l based on the
+actual node value. This operation occurs in O(k) time where k is the
+number of levels of the node in question (max 8). The original delete
+function occurred in O(log n) time and involved a search.
+
+MuQSS Notes: In this implementation of skiplists, there are bidirectional
+next/prev pointers, and the insert function returns a pointer to the actual
+node in which the value is stored. The key here is chosen by the scheduler so as to
+sort tasks according to the priority list requirements and is no longer used
+by the scheduler after insertion. The scheduler lookup, however, occurs in
+O(1) time because it is always the first item in the level 0 linked list.
+Since the task struct stores a copy of the node pointer upon skiplist_insert,
+it can also remove it much faster than the original implementation with the
+aid of prev<->next pointer manipulation and no searching.
+
+*/
+
+#include <linux/slab.h>
+#include <linux/skip_list.h>
+
+#define MaxNumberOfLevels 8
+#define MaxLevel (MaxNumberOfLevels - 1)
+
+void skiplist_init(skiplist_node *slnode)
+{
+ int i;
+
+ slnode->key = 0xFFFFFFFFFFFFFFFF;
+ slnode->level = 0;
+ slnode->value = NULL;
+ for (i = 0; i < MaxNumberOfLevels; i++)
+ slnode->next[i] = slnode->prev[i] = slnode;
+}
+
+skiplist *new_skiplist(skiplist_node *slnode)
+{
+ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC);
+
+ BUG_ON(!l);
+ l->header = slnode;
+ return l;
+}
+
+void free_skiplist(skiplist *l)
+{
+ skiplist_node *p, *q;
+
+ p = l->header;
+ do {
+ q = p->next[0];
+ p->next[0]->prev[0] = q->prev[0];
+ skiplist_node_init(p);
+ p = q;
+ } while (p != l->header);
+ kfree(l);
+}
+
+void skiplist_node_init(skiplist_node *node)
+{
+ memset(node, 0, sizeof(skiplist_node));
+}
+
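+/*
+ * For a uniformly random seed, find_first_bit() is geometrically
+ * distributed: the first set bit is at position k with probability
+ * 2^-(k+1). Halving k gives P(level >= l) = 4^-l, i.e. the p = 0.25
+ * promotion probability described in the header comment above.
+ */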
+static inline unsigned int randomLevel(const long unsigned int randseed)
+{
+ return find_first_bit(&randseed, MaxLevel) / 2;
+}
+
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed)
+{
+ skiplist_node *update[MaxNumberOfLevels];
+ skiplist_node *p, *q;
+ int k = l->level;
+
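+ /*
+ * Descend from the top level, recording at each level the rightmost
+ * node with key <= the new key; the new node is spliced in after
+ * these update points.
+ */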
+ p = l->header;
+ do {
+ while (q = p->next[k], q->key <= key)
+ p = q;
+ update[k] = p;
+ } while (--k >= 0);
+
+ ++l->entries;
+ k = randomLevel(randseed);
+ if (k > l->level) {
+ k = ++l->level;
+ update[k] = l->header;
+ }
+
+ node->level = k;
+ node->key = key;
+ node->value = value;
+ do {
+ p = update[k];
+ node->next[k] = p->next[k];
+ p->next[k] = node;
+ node->prev[k] = p;
+ node->next[k]->prev[k] = node;
+ } while (--k >= 0);
+}
+
+void skiplist_delete(skiplist *l, skiplist_node *node)
+{
+ int k, m = node->level;
+
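+ /* Unlink the node at every level it occupies; no search is needed. */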
+ for (k = 0; k <= m; k++) {
+ node->prev[k]->next[k] = node->next[k];
+ node->next[k]->prev[k] = node->prev[k];
+ }
+ skiplist_node_init(node);
+ if (m == l->level) {
+ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0)
+ m--;
+ l->level = m;
+ }
+ l->entries--;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d4a78e08f6d89c..b2020833509675 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -120,6 +120,14 @@ static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int two_hundred = 200;
static int one_thousand = 1000;
+static int zero = 0;
+static int one = 1;
+#ifdef CONFIG_SCHED_MUQSS
+extern int rr_interval;
+extern int sched_interactive;
+extern int sched_iso_cpu;
+extern int sched_yield_type;
+#endif
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -187,6 +195,10 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
int sysctl_legacy_va_layout;
#endif
+
+
+
+
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
static int max_extfrag_threshold = 1000;
@@ -1729,6 +1741,7 @@ int proc_do_static_key(struct ctl_table *table, int write,
}
static struct ctl_table kern_table[] = {
+#ifndef CONFIG_SCHED_MUQSS
{
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
@@ -1736,6 +1749,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
@@ -1747,8 +1761,10 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
+
#ifdef CONFIG_NUMA_BALANCING
{
+
.procname = "numa_balancing",
.data = NULL, /* filled in by handler */
.maxlen = sizeof(unsigned int),
@@ -1758,6 +1774,7 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_NUMA_BALANCING */
+
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
@@ -1837,6 +1854,56 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ONE,
},
#endif
+#elif defined(CONFIG_SCHED_MUQSS)
+ {
+ .procname = "rr_interval",
+ .data = &rr_interval,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one_thousand,
+ },
+ {
+ .procname = "interactive",
+ .data = &sched_interactive,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "iso_cpu",
+ .data = &sched_iso_cpu,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "yield_type",
+ .data = &sched_yield_type,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHEDSTATS)
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP && CONFIG_SCHEDSTATS */
+#endif /* CONFIG_SCHED_MUQSS */
+
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
{
.procname = "sched_energy_aware",
@@ -2623,6 +2690,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = bpf_unpriv_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
+
},
{
.procname = "bpf_stats_enabled",
@@ -3431,4 +3499,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
-EXPORT_SYMBOL(proc_do_large_bitmap);
+EXPORT_SYMBOL(proc_do_large_bitmap);
\ No newline at end of file
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 83e158d016bada..99235d57431e75 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -132,7 +132,7 @@ config CONTEXT_TRACKING
config CONTEXT_TRACKING_FORCE
bool "Force context tracking"
- depends on CONTEXT_TRACKING
+ depends on CONTEXT_TRACKING && !SCHED_MUQSS
default y if !NO_HZ_FULL
help
The major pre-requirement for full dynticks to work is to
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f5490222e134a7..7a61971cca7409 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -190,8 +190,13 @@ int clockevents_tick_resume(struct clock_event_device *dev)
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
+#ifdef CONFIG_SCHED_MUQSS
+/* Limit min_delta to 100us */
+#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000)
+#else
/* Limit min_delta to a jiffie */
#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
+#endif
/**
* clockevents_increase_min_delta - raise minimum delta of a clock event device
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3bb96a8b49c9bd..970b2d9f62b2ed 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples)
u64 stime, utime;
task_cputime(p, &utime, &stime);
- store_samples(samples, stime, utime, p->se.sum_exec_runtime);
+ store_samples(samples, stime, utime, tsk_seruntime(p));
}
static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
@@ -850,7 +850,7 @@ static void check_thread_timers(struct task_struct *tsk,
soft = task_rlimit(tsk, RLIMIT_RTTIME);
if (soft != RLIM_INFINITY) {
/* Task RT timeout is accounted in jiffies. RTTIME is usec */
- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ);
unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
/* At the hard limit, send SIGKILL. No further action. */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d111adf4a0cb4c..819453065637d9 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1605,7 +1605,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
* Check, if the next hrtimer event is before the next timer wheel
* event:
*/
-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
+static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires)
{
u64 nextevt = hrtimer_get_next_event();
@@ -1623,6 +1623,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
if (nextevt <= basem)
return basem;
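+ /* The next hrtimer fires before the wheel timeout and within a tick; do not let the base stay idle */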
+ if (nextevt < expires && nextevt - basem <= TICK_NSEC)
+ base->is_idle = false;
+
/*
* Round up to the next jiffie. High resolution timers are
* off, so the hrtimers are expired in the tick and we need to
@@ -1692,7 +1695,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
}
raw_spin_unlock(&base->lock);
- return cmp_next_hrtimer_event(basem, expires);
+ return cmp_next_hrtimer_event(base, basem, expires);
}
/**
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index adf7ef19400500..5c53163120b05f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1052,10 +1052,15 @@ static int trace_wakeup_test_thread(void *data)
{
/* Make this a -deadline thread */
static const struct sched_attr attr = {
+#ifdef CONFIG_SCHED_MUQSS
+ /* No deadline on MuQSS, use RR */
+ .sched_policy = SCHED_RR,
+#else
.sched_policy = SCHED_DEADLINE,
.sched_runtime = 100000ULL,
.sched_deadline = 10000000ULL,
.sched_period = 10000000ULL
+#endif
};
struct wakeup_test_data *x = data;
diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh
index dd554bd436cc65..75030ad939a4e4 100755
--- a/scripts/headers_install.sh
+++ b/scripts/headers_install.sh
@@ -89,6 +89,7 @@ include/uapi/linux/atmdev.h:CONFIG_COMPAT
include/uapi/linux/eventpoll.h:CONFIG_PM_SLEEP
include/uapi/linux/hw_breakpoint.h:CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
include/uapi/linux/pktcdvd.h:CONFIG_CDROM_PKTCDVD_WCACHE
+include/uapi/linux/sched.h:CONFIG_SCHED_MUQSS
"
for c in $configs
From 96a895d2284936e4a29cc9ec8b639d14cd3dfd56 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Sat, 12 Aug 2017 11:53:39 +1000
Subject: [PATCH 02/34] Create highres timeout variants of schedule_timeout
functions.
---
include/linux/freezer.h | 1 +
include/linux/sched.h | 31 ++++++++++++++++--
kernel/time/hrtimer.c | 69 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 99 insertions(+), 2 deletions(-)
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 0621c5f86c397e..97f58f6c2b6355 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -310,6 +310,7 @@ static inline void set_freezable(void) {}
#define wait_event_freezekillable_unsafe(wq, condition) \
wait_event_killable(wq, condition)
+#define pm_freezing (false)
#endif /* !CONFIG_FREEZER */
#endif /* FREEZER_H_INCLUDED */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4d25a51883d81a..11a5a6054ac7ac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -220,13 +220,40 @@ struct task_group;
extern void scheduler_tick(void);
-#define MAX_SCHEDULE_TIMEOUT LONG_MAX
-
+#define MAX_SCHEDULE_TIMEOUT LONG_MAX
extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+extern long schedule_msec_hrtimeout(long timeout);
+extern long schedule_min_hrtimeout(void);
+extern long schedule_msec_hrtimeout_interruptible(long timeout);
+extern long schedule_msec_hrtimeout_uninterruptible(long timeout);
+#else
+static inline long schedule_msec_hrtimeout(long timeout)
+{
+ return schedule_timeout(msecs_to_jiffies(timeout));
+}
+
+static inline long schedule_min_hrtimeout(void)
+{
+ return schedule_timeout(1);
+}
+
+static inline long schedule_msec_hrtimeout_interruptible(long timeout)
+{
+ return schedule_timeout_interruptible(msecs_to_jiffies(timeout));
+}
+
+static inline long schedule_msec_hrtimeout_uninterruptible(long timeout)
+{
+ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout));
+}
+#endif
+
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4a66725b1d4ac0..46757e9c60e115 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2236,3 +2236,72 @@ int __sched schedule_hrtimeout(ktime_t *expires,
return schedule_hrtimeout_range(expires, 0, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);
+
+/*
+ * As per schedule_hrtimeout but takes a millisecond value and returns how
+ * many milliseconds are left.
+ */
+long __sched schedule_msec_hrtimeout(long timeout)
+{
+ struct hrtimer_sleeper t;
+ int delta, secs, jiffs;
+ ktime_t expires;
+
+ if (!timeout) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ jiffs = msecs_to_jiffies(timeout);
+ /*
+ * If regular timer resolution is adequate or hrtimer resolution is not
+ * (yet) better than Hz, as would occur during startup, use regular
+ * timers.
+ */
+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ)
+ return schedule_timeout(jiffs);
+
+ secs = timeout / 1000;
+ delta = (timeout % 1000) * NSEC_PER_MSEC;
+ expires = ktime_set(secs, delta);
+
+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_set_expires_range_ns(&t.timer, expires, delta);
+
+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL);
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ expires = hrtimer_expires_remaining(&t.timer);
+ timeout = ktime_to_ms(expires);
+ return timeout < 0 ? 0 : timeout;
+}
+EXPORT_SYMBOL(schedule_msec_hrtimeout);
+
+long __sched schedule_min_hrtimeout(void)
+{
+ return schedule_msec_hrtimeout(1);
+}
+EXPORT_SYMBOL(schedule_min_hrtimeout);
+
+long __sched schedule_msec_hrtimeout_interruptible(long timeout)
+{
+ __set_current_state(TASK_INTERRUPTIBLE);
+ return schedule_msec_hrtimeout(timeout);
+}
+EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible);
+
+long __sched schedule_msec_hrtimeout_uninterruptible(long timeout)
+{
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ return schedule_msec_hrtimeout(timeout);
+}
+EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible);
From f85d43e3d0926a4e92ede6562c9fdbb8dfe84faf Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Sat, 5 Nov 2016 09:27:36 +1100
Subject: [PATCH 03/34] Special case calls of schedule_timeout(1) to use the
min hrtimeout of 1ms, working around low Hz resolutions.
---
kernel/time/timer.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 819453065637d9..a510635165ce91 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1889,6 +1889,18 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
+#ifdef CONFIG_HIGH_RES_TIMERS
+ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) {
+ /*
+ * Special case 1 as being a request for the minimum timeout
+ * and use highres timers to timeout after 1ms to workaround
+ * the granularity of low Hz tick timers.
+ */
+ if (!schedule_min_hrtimeout())
+ return 0;
+ goto out_timeout;
+ }
+#endif
timer.task = current;
timer_setup_on_stack(&timer.timer, process_timeout, 0);
__mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
@@ -1897,10 +1909,10 @@ signed long __sched schedule_timeout(signed long timeout)
/* Remove the timer from the object tracker */
destroy_timer_on_stack(&timer.timer);
-
+out_timeout:
timeout = expire - jiffies;
- out:
+out:
return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
From eaca1708095629258cca59fbd9b2a874214b51a5 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 4 Nov 2016 09:25:54 +1100
Subject: [PATCH 04/34] Convert msleep to use hrtimers when active.
---
kernel/time/timer.c | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a510635165ce91..d4a9e45e0e2346 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2055,7 +2055,19 @@ void __init init_timers(void)
*/
void msleep(unsigned int msecs)
{
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+ int jiffs = msecs_to_jiffies(msecs);
+ unsigned long timeout;
+
+ /*
+ * Use high resolution timers where the resolution of tick based
+ * timers is inadequate.
+ */
+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) {
+ while (msecs)
+ msecs = schedule_msec_hrtimeout_uninterruptible(msecs);
+ return;
+ }
+ timeout = msecs_to_jiffies(msecs) + 1;
while (timeout)
timeout = schedule_timeout_uninterruptible(timeout);
@@ -2069,7 +2081,15 @@ EXPORT_SYMBOL(msleep);
*/
unsigned long msleep_interruptible(unsigned int msecs)
{
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+ int jiffs = msecs_to_jiffies(msecs);
+ unsigned long timeout;
+
+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) {
+ while (msecs && !signal_pending(current))
+ msecs = schedule_msec_hrtimeout_interruptible(msecs);
+ return msecs;
+ }
+ timeout = msecs_to_jiffies(msecs) + 1;
while (timeout && !signal_pending(current))
timeout = schedule_timeout_interruptible(timeout);
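A worked example of the resulting msleep() behaviour, with my own numbers and assuming HZ=100 (one jiffy = 10ms) and sub-jiffy hrtimer resolution:

    /* msleep(3):  msecs_to_jiffies(3) == 1, which is < 5, so the call
     *             loops on schedule_msec_hrtimeout_uninterruptible(3)
     *             and sleeps ~3ms instead of the old 2 jiffies (~20ms).
     * msleep(80): msecs_to_jiffies(80) == 8, which is >= 5, so the
     *             tick-based path is kept: timeout = 8 + 1 jiffies. */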
From 3760ffa8a6046ea24c73717de4e809c319224d68 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Mon, 20 Feb 2017 13:28:30 +1100
Subject: [PATCH 05/34] Replace all schedule_timeout(1) calls with
schedule_min_hrtimeout()
---
.../accessibility/speakup/speakup_acntpc.c | 4 +--
.../accessibility/speakup/speakup_apollo.c | 2 +-
.../accessibility/speakup/speakup_decext.c | 2 +-
drivers/accessibility/speakup/speakup_decpc.c | 2 +-
.../accessibility/speakup/speakup_dectlk.c | 2 +-
drivers/accessibility/speakup/speakup_dtlk.c | 4 +--
drivers/accessibility/speakup/speakup_keypc.c | 4 +--
drivers/accessibility/speakup/synth.c | 14 +++------
drivers/block/swim.c | 5 ++--
drivers/char/ipmi/ipmi_msghandler.c | 2 +-
drivers/char/ipmi/ipmi_ssif.c | 2 +-
drivers/comedi/drivers/ni_mio_common.c | 2 +-
drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c | 2 +-
drivers/gpu/drm/vmwgfx/vmwgfx_irq.c | 2 +-
drivers/media/pci/ivtv/ivtv-ioctl.c | 2 +-
drivers/media/pci/ivtv/ivtv-streams.c | 2 +-
drivers/mfd/ucb1x00-core.c | 2 +-
drivers/misc/sgi-xp/xpc_channel.c | 2 +-
drivers/net/caif/caif_hsi.c | 2 +-
drivers/net/can/usb/peak_usb/pcan_usb.c | 2 +-
drivers/net/usb/lan78xx.c | 2 +-
drivers/net/usb/usbnet.c | 2 +-
drivers/scsi/fnic/fnic_scsi.c | 29 +++++++++++++++++--
drivers/scsi/snic/snic_scsi.c | 2 +-
drivers/staging/rts5208/rtsx.c | 2 +-
.../staging/unisys/visornic/visornic_main.c | 6 ++--
drivers/video/fbdev/omap/hwa742.c | 2 +-
drivers/video/fbdev/pxafb.c | 2 +-
sound/usb/line6/pcm.c | 2 +-
29 files changed, 64 insertions(+), 46 deletions(-)
diff --git a/drivers/accessibility/speakup/speakup_acntpc.c b/drivers/accessibility/speakup/speakup_acntpc.c
index c1ec087dca1307..b2d0d4266f62d2 100644
--- a/drivers/accessibility/speakup/speakup_acntpc.c
+++ b/drivers/accessibility/speakup/speakup_acntpc.c
@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth)
full_time_val = full_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
if (synth_full()) {
- schedule_timeout(msecs_to_jiffies(full_time_val));
+ schedule_msec_hrtimeout((full_time_val));
continue;
}
set_current_state(TASK_RUNNING);
@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth)
jiffy_delta_val = jiffy_delta->u.n.value;
delay_time_val = delay_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout(delay_time_val);
jiff_max = jiffies + jiffy_delta_val;
}
}
diff --git a/drivers/accessibility/speakup/speakup_apollo.c b/drivers/accessibility/speakup/speakup_apollo.c
index cd63581b2e99ed..d636157a28448f 100644
--- a/drivers/accessibility/speakup/speakup_apollo.c
+++ b/drivers/accessibility/speakup/speakup_apollo.c
@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth)
if (!synth->io_ops->synth_out(synth, ch)) {
synth->io_ops->tiocmset(synth, 0, UART_MCR_RTS);
synth->io_ops->tiocmset(synth, UART_MCR_RTS, 0);
- schedule_timeout(msecs_to_jiffies(full_time_val));
+ schedule_msec_hrtimeout(full_time_val);
continue;
}
if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) {
diff --git a/drivers/accessibility/speakup/speakup_decext.c b/drivers/accessibility/speakup/speakup_decext.c
index 092cfd08a9e18d..e7fc85f8ce5c84 100644
--- a/drivers/accessibility/speakup/speakup_decext.c
+++ b/drivers/accessibility/speakup/speakup_decext.c
@@ -180,7 +180,7 @@ static void do_catch_up(struct spk_synth *synth)
if (ch == '\n')
ch = 0x0D;
if (synth_full() || !synth->io_ops->synth_out(synth, ch)) {
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout(delay_time_val);
continue;
}
set_current_state(TASK_RUNNING);
diff --git a/drivers/accessibility/speakup/speakup_decpc.c b/drivers/accessibility/speakup/speakup_decpc.c
index dec314dee214f0..2a5deb5256b25a 100644
--- a/drivers/accessibility/speakup/speakup_decpc.c
+++ b/drivers/accessibility/speakup/speakup_decpc.c
@@ -398,7 +398,7 @@ static void do_catch_up(struct spk_synth *synth)
if (ch == '\n')
ch = 0x0D;
if (dt_sendchar(ch)) {
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
continue;
}
set_current_state(TASK_RUNNING);
diff --git a/drivers/accessibility/speakup/speakup_dectlk.c b/drivers/accessibility/speakup/speakup_dectlk.c
index 580ec796816bcf..67c156b90ddbc4 100644
--- a/drivers/accessibility/speakup/speakup_dectlk.c
+++ b/drivers/accessibility/speakup/speakup_dectlk.c
@@ -256,7 +256,7 @@ static void do_catch_up(struct spk_synth *synth)
if (ch == '\n')
ch = 0x0D;
if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) {
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout(delay_time_val);
continue;
}
set_current_state(TASK_RUNNING);
diff --git a/drivers/accessibility/speakup/speakup_dtlk.c b/drivers/accessibility/speakup/speakup_dtlk.c
index 92838d3ae9eb8d..b687cb4d32683e 100644
--- a/drivers/accessibility/speakup/speakup_dtlk.c
+++ b/drivers/accessibility/speakup/speakup_dtlk.c
@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth)
delay_time_val = delay_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
if (synth_full()) {
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
continue;
}
set_current_state(TASK_RUNNING);
@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth)
delay_time_val = delay_time->u.n.value;
jiffy_delta_val = jiffy_delta->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout((delay_time_val));
jiff_max = jiffies + jiffy_delta_val;
}
}
diff --git a/drivers/accessibility/speakup/speakup_keypc.c b/drivers/accessibility/speakup/speakup_keypc.c
index 311f4aa0be22d6..99c523fdcc98f5 100644
--- a/drivers/accessibility/speakup/speakup_keypc.c
+++ b/drivers/accessibility/speakup/speakup_keypc.c
@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth)
full_time_val = full_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
if (synth_full()) {
- schedule_timeout(msecs_to_jiffies(full_time_val));
+ schedule_msec_hrtimeout((full_time_val));
continue;
}
set_current_state(TASK_RUNNING);
@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth)
jiffy_delta_val = jiffy_delta->u.n.value;
delay_time_val = delay_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
- schedule_timeout(msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout(delay_time_val);
jiff_max = jiffies + jiffy_delta_val;
}
}
diff --git a/drivers/accessibility/speakup/synth.c b/drivers/accessibility/speakup/synth.c
index 2b8699673bace5..bf0cbdaf564f80 100644
--- a/drivers/accessibility/speakup/synth.c
+++ b/drivers/accessibility/speakup/synth.c
@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode)
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
if (ch == '\n')
ch = synth->procspeech;
- if (unicode)
- ret = synth->io_ops->synth_out_unicode(synth, ch);
- else
- ret = synth->io_ops->synth_out(synth, ch);
- if (!ret) {
- schedule_timeout(msecs_to_jiffies(full_time_val));
+ if (!synth->io_ops->synth_out(synth, ch)) {
+ schedule_msec_hrtimeout(full_time_val);
continue;
}
if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) {
@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode)
full_time_val = full_time->u.n.value;
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
if (synth->io_ops->synth_out(synth, synth->procspeech))
- schedule_timeout(
- msecs_to_jiffies(delay_time_val));
+ schedule_msec_hrtimeout(delay_time_val);
else
- schedule_timeout(
- msecs_to_jiffies(full_time_val));
+ schedule_msec_hrtimeout(full_time_val);
jiff_max = jiffies + jiffy_delta_val;
}
set_current_state(TASK_RUNNING);
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 2917b21f48ff27..fcad8d963cd893 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base,
if (swim_readbit(base, MOTOR_ON))
break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
} else if (action == OFF) {
swim_action(base, MOTOR_OFF);
@@ -347,7 +347,7 @@ static inline void swim_eject(struct swim __iomem *base)
if (!swim_readbit(base, DISK_IN))
break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
swim_select(base, RELAX);
}
@@ -372,6 +372,7 @@ static inline int swim_step(struct swim __iomem *base)
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(1);
+ schedule_min_hrtimeout();
swim_select(base, RELAX);
if (!swim_readbit(base, STEP))
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 8a0e97b33caef8..b5b22e487c0032 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -3548,7 +3548,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf)
/* Current message first, to preserve order */
while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) {
/* Wait for the message to clear out. */
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
/* No need for locks, the interface is down. */
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index 20d5af92966d43..05b8d6a8171ff3 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -1247,7 +1247,7 @@ static void shutdown_ssif(void *send_info)
/* make sure the driver is not looking for flags any more. */
while (ssif_info->ssif_state != SSIF_NORMAL)
- schedule_timeout(1);
+ schedule_min_hrtimeout();
ssif_info->stopping = true;
del_timer_sync(&ssif_info->watch_timer);
diff --git a/drivers/comedi/drivers/ni_mio_common.c b/drivers/comedi/drivers/ni_mio_common.c
index 4f80a4991f953a..c164c85249098b 100644
--- a/drivers/comedi/drivers/ni_mio_common.c
+++ b/drivers/comedi/drivers/ni_mio_common.c
@@ -4747,7 +4747,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev)
if ((status & NI67XX_CAL_STATUS_BUSY) == 0)
break;
set_current_state(TASK_INTERRUPTIBLE);
- if (schedule_timeout(1))
+ if (schedule_min_hrtimeout())
return -EIO;
}
if (i == timeout) {
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
index 20246a7c97c9dc..499a4542383c57 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
@@ -227,7 +227,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv,
DRM_ERROR("SVGA device lockup.\n");
break;
}
- schedule_timeout(1);
+ schedule_min_hrtimeout();
if (interruptible && signal_pending(current)) {
ret = -ERESTARTSYS;
break;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
index 6c2a569f1fcb18..8d7feeb0d7abcd 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
@@ -201,7 +201,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv,
break;
}
if (lazy)
- schedule_timeout(1);
+ schedule_min_hrtimeout();
else if ((++count & 0x0F) == 0) {
/**
* FIXME: Use schedule_hr_timeout here for
diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c
index 35dccb31174c1e..8181cd65e87646 100644
--- a/drivers/media/pci/ivtv/ivtv-ioctl.c
+++ b/drivers/media/pci/ivtv/ivtv-ioctl.c
@@ -1139,7 +1139,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std)
TASK_UNINTERRUPTIBLE);
if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100)
break;
- schedule_timeout(msecs_to_jiffies(25));
+ schedule_msec_hrtimeout((25));
}
finish_wait(&itv->vsync_waitq, &wait);
mutex_lock(&itv->serialize_lock);
diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c
index f04ee84bab5fd3..c4469b4b8f99c1 100644
--- a/drivers/media/pci/ivtv/ivtv-streams.c
+++ b/drivers/media/pci/ivtv/ivtv-streams.c
@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end)
while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) &&
time_before(jiffies,
then + msecs_to_jiffies(2000))) {
- schedule_timeout(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout((10));
}
/* To convert jiffies to ms, we must multiply by 1000
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
index b690796d24d4be..448b13da62b435 100644
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync)
break;
/* yield to other processes */
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
}
return UCB_ADC_DAT(val);
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
index 8e6607fc8a67be..b9ab770bbdb5d6 100644
--- a/drivers/misc/sgi-xp/xpc_channel.c
+++ b/drivers/misc/sgi-xp/xpc_channel.c
@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch)
atomic_inc(&ch->n_on_msg_allocate_wq);
prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE);
- ret = schedule_timeout(1);
+ ret = schedule_min_hrtimeout();
finish_wait(&ch->msg_allocate_wq, &wait);
atomic_dec(&ch->n_on_msg_allocate_wq);
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
index 3d63b15bbaa10c..164071e9d4579c 100644
--- a/drivers/net/caif/caif_hsi.c
+++ b/drivers/net/caif/caif_hsi.c
@@ -924,7 +924,7 @@ static void cfhsi_wake_down(struct work_struct *work)
break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
retry--;
}
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c
index 1d6f77252f0184..6ce9be987266c9 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c
@@ -299,7 +299,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff)
} else {
/* the PCAN-USB needs time to init */
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT));
+ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT));
}
return err;
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 02bce40a67e5ba..69758f825a4da4 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -2655,7 +2655,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev)
while (!skb_queue_empty(&dev->rxq) &&
!skb_queue_empty(&dev->txq) &&
!skb_queue_empty(&dev->done)) {
- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS));
set_current_state(TASK_UNINTERRUPTIBLE);
netif_dbg(dev, ifdown, dev->net,
"waited for %d urb completions\n", temp);
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index ecf62849f4c1fc..ed6b424a59709a 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -764,7 +764,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q)
spin_lock_irqsave(&q->lock, flags);
while (!skb_queue_empty(q)) {
spin_unlock_irqrestore(&q->lock, flags);
- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS));
set_current_state(TASK_UNINTERRUPTIBLE);
spin_lock_irqsave(&q->lock, flags);
}
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 762cc8bd2653fc..75c557c08ac6f3 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -217,7 +217,7 @@ int fnic_fw_reset_handler(struct fnic *fnic)
/* wait for io cmpl */
while (atomic_read(&fnic->in_flight))
- schedule_timeout(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout((1));
spin_lock_irqsave(&fnic->wq_copy_lock[0], flags);
@@ -1367,13 +1367,16 @@ static bool fnic_cleanup_io_iter(struct scsi_cmnd *sc, void *data,
struct fnic *fnic = data;
struct fnic_io_req *io_req;
unsigned long flags = 0;
+
spinlock_t *io_lock;
unsigned long start_time = 0;
struct fnic_stats *fnic_stats = &fnic->fnic_stats;
io_lock = fnic_io_lock_tag(fnic, sc->request->tag);
+
spin_lock_irqsave(io_lock, flags);
+
io_req = (struct fnic_io_req *)CMD_SP(sc);
if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) &&
!(CMD_FLAGS(sc) & FNIC_DEV_RST_DONE)) {
@@ -1415,6 +1418,7 @@ static bool fnic_cleanup_io_iter(struct scsi_cmnd *sc, void *data,
"fnic_cleanup_io: tag:0x%x : sc:0x%p duration = %lu DID_TRANSPORT_DISRUPTED\n",
sc->request->tag, sc, (jiffies - start_time));
+
if (atomic64_read(&fnic->io_cmpl_skip))
atomic64_dec(&fnic->io_cmpl_skip);
else
@@ -1570,14 +1574,17 @@ static bool fnic_rport_abort_io_iter(struct scsi_cmnd *sc, void *data,
struct fnic_io_req *io_req;
spinlock_t *io_lock;
unsigned long flags;
+
struct reset_stats *reset_stats = &fnic->fnic_stats.reset_stats;
struct terminate_stats *term_stats = &fnic->fnic_stats.term_stats;
struct scsi_lun fc_lun;
enum fnic_ioreq_state old_ioreq_state;
io_lock = fnic_io_lock_tag(fnic, abt_tag);
+
spin_lock_irqsave(io_lock, flags);
+
io_req = (struct fnic_io_req *)CMD_SP(sc);
if (!io_req || io_req->port_id != iter_data->port_id) {
@@ -1622,6 +1629,7 @@ static bool fnic_rport_abort_io_iter(struct scsi_cmnd *sc, void *data,
if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) {
atomic64_inc(&reset_stats->device_reset_terminates);
abt_tag |= FNIC_TAG_DEV_RST;
+
}
FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
"fnic_rport_exch_reset dev rst sc 0x%p\n", sc);
@@ -1687,10 +1695,12 @@ static void fnic_rport_exch_reset(struct fnic *fnic, u32 port_id)
void fnic_terminate_rport_io(struct fc_rport *rport)
{
+
struct fc_rport_libfc_priv *rdata;
struct fc_lport *lport;
struct fnic *fnic;
+
if (!rport) {
printk(KERN_ERR "fnic_terminate_rport_io: rport is NULL\n");
return;
@@ -1718,6 +1728,7 @@ void fnic_terminate_rport_io(struct fc_rport *rport)
return;
fnic_rport_exch_reset(fnic, rport->port_id);
+
}
/*
@@ -2019,6 +2030,7 @@ struct fnic_pending_aborts_iter_data {
int ret;
};
+
static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc,
void *data, bool reserved)
{
@@ -2029,7 +2041,9 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc,
struct fnic_io_req *io_req;
spinlock_t *io_lock;
unsigned long flags;
+
struct scsi_lun fc_lun;
+
DECLARE_COMPLETION_ONSTACK(tm_done);
enum fnic_ioreq_state old_ioreq_state;
@@ -2038,10 +2052,12 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc,
if (reserved)
return true;
+
io_lock = fnic_io_lock_tag(fnic, abt_tag);
spin_lock_irqsave(io_lock, flags);
io_req = (struct fnic_io_req *)CMD_SP(sc);
if (!io_req) {
+
spin_unlock_irqrestore(io_lock, flags);
return true;
}
@@ -2083,6 +2099,7 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc,
BUG_ON(io_req->abts_done);
+
if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) {
abt_tag |= FNIC_TAG_DEV_RST;
FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host,
@@ -2117,6 +2134,7 @@ static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc,
CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED;
wait_for_completion_timeout(&tm_done, msecs_to_jiffies
+
(fnic->config.ed_tov));
/* Recheck cmd state to check if it is now aborted */
@@ -2189,7 +2207,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic,
ret = iter_data.ret;
goto clean_pending_aborts_end;
}
- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov));
+ schedule_msec_hrtimeout((2 * fnic->config.ed_tov));
/* walk again to check, if IOs are still pending in fw */
if (fnic_is_abts_pending(fnic, lr_sc))
@@ -2689,6 +2707,7 @@ void fnic_exch_mgr_reset(struct fc_lport *lp, u32 sid, u32 did)
static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data,
bool reserved)
+
{
struct fnic_pending_aborts_iter_data *iter_data = data;
struct fnic *fnic = iter_data->fnic;
@@ -2697,6 +2716,8 @@ static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data,
spinlock_t *io_lock;
unsigned long flags;
+
+
/*
* ignore this lun reset cmd or cmds that do not belong to
* this lun
@@ -2711,6 +2732,7 @@ static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data,
io_req = (struct fnic_io_req *)CMD_SP(sc);
if (!io_req) {
+
spin_unlock_irqrestore(io_lock, flags);
return true;
}
@@ -2723,6 +2745,7 @@ static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data,
"Found IO in %s on lun\n",
fnic_ioreq_state_to_str(CMD_STATE(sc)));
cmd_state = CMD_STATE(sc);
+
spin_unlock_irqrestore(io_lock, flags);
if (cmd_state == FNIC_IOREQ_ABTS_PENDING)
iter_data->ret = 1;
@@ -2755,4 +2778,4 @@ int fnic_is_abts_pending(struct fnic *fnic, struct scsi_cmnd *lr_sc)
fnic_abts_pending_iter, &iter_data);
return iter_data.ret;
-}
+}
\ No newline at end of file
diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c
index 6dd0ff188bb49f..aedf0b78f62290 100644
--- a/drivers/scsi/snic/snic_scsi.c
+++ b/drivers/scsi/snic/snic_scsi.c
@@ -2349,7 +2349,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc)
/* Wait for all the IOs that are entered in Qcmd */
while (atomic_read(&snic->ios_inflight))
- schedule_timeout(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout((1));
ret = snic_issue_hba_reset(snic, sc);
if (ret) {
diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c
index 898add4d1fc881..0aa9dd4673492e 100644
--- a/drivers/staging/rts5208/rtsx.c
+++ b/drivers/staging/rts5208/rtsx.c
@@ -477,7 +477,7 @@ static int rtsx_polling_thread(void *__dev)
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL));
+ schedule_msec_hrtimeout((POLLING_INTERVAL));
/* lock the device pointers */
mutex_lock(&dev->dev_mutex);
diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c
index a3bc568c660d4d..e7cac6df0202aa 100644
--- a/drivers/staging/unisys/visornic/visornic_main.c
+++ b/drivers/staging/unisys/visornic/visornic_main.c
@@ -547,7 +547,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev,
}
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irqrestore(&devdata->priv_lock, flags);
- wait += schedule_timeout(msecs_to_jiffies(10));
+ wait += schedule_msec_hrtimeout((10));
spin_lock_irqsave(&devdata->priv_lock, flags);
}
@@ -558,7 +558,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev,
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irqrestore(&devdata->priv_lock, flags);
- schedule_timeout(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout((10));
spin_lock_irqsave(&devdata->priv_lock, flags);
if (atomic_read(&devdata->usage))
break;
@@ -712,7 +712,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev,
}
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irqrestore(&devdata->priv_lock, flags);
- wait += schedule_timeout(msecs_to_jiffies(10));
+ wait += schedule_msec_hrtimeout((10));
spin_lock_irqsave(&devdata->priv_lock, flags);
}
diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c
index b191bef22d9845..a8c491e1d844e0 100644
--- a/drivers/video/fbdev/omap/hwa742.c
+++ b/drivers/video/fbdev/omap/hwa742.c
@@ -927,7 +927,7 @@ static void hwa742_resume(void)
if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7))
break;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(5));
+ schedule_msec_hrtimeout((5));
}
hwa742_set_update_mode(hwa742.update_mode_before_suspend);
}
diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c
index f1551e00eb12f1..f0f651e925044a 100644
--- a/drivers/video/fbdev/pxafb.c
+++ b/drivers/video/fbdev/pxafb.c
@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg)
mutex_unlock(&fbi->ctrlr_lock);
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(30));
+ schedule_msec_hrtimeout((30));
}
pr_debug("%s(): task ending\n", __func__);
diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c
index fdbdfb7bce92b0..fa8e8faf3eb34b 100644
--- a/sound/usb/line6/pcm.c
+++ b/sound/usb/line6/pcm.c
@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm,
if (!alive)
break;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_min_hrtimeout();
} while (--timeout > 0);
if (alive)
dev_err(line6pcm->line6->ifcdev,
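Every hunk above applies one of the same two mechanical conversions, sketched once here for reference (illustrative, not an excerpt from any one driver; delay_ms stands in for each driver's millisecond value):

    /* 1) Fixed one-jiffy sleeps become minimum hrtimer sleeps: */
    set_current_state(TASK_INTERRUPTIBLE);
    schedule_min_hrtimeout();		/* was schedule_timeout(1) */

    /* 2) Millisecond-denominated sleeps drop the jiffies conversion,
     * since the new helper takes milliseconds directly: */
    schedule_msec_hrtimeout(delay_ms);	/* was schedule_timeout(msecs_to_jiffies(delay_ms)) */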
From 442a069df90576d152f82724727e70307a7be62f Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Mon, 20 Feb 2017 13:30:07 +1100
Subject: [PATCH 06/34] Convert all schedule_timeout_interruptible calls with
 timeouts potentially under 50ms to schedule_msec_hrtimeout_interruptible.
---
drivers/hwmon/fam15h_power.c | 2 +-
drivers/iio/light/tsl2563.c | 6 +-----
drivers/media/i2c/msp3400-driver.c | 4 ++--
drivers/media/pci/ivtv/ivtv-gpio.c | 6 +++---
drivers/media/radio/radio-mr800.c | 2 +-
drivers/media/radio/radio-tea5777.c | 2 +-
drivers/media/radio/tea575x.c | 2 +-
drivers/parport/ieee1284.c | 2 +-
drivers/parport/ieee1284_ops.c | 2 +-
drivers/platform/x86/intel_ips.c | 8 ++++----
net/core/pktgen.c | 2 +-
sound/soc/codecs/wm8350.c | 12 ++++++------
sound/soc/codecs/wm8900.c | 2 +-
sound/soc/codecs/wm9713.c | 4 ++--
14 files changed, 26 insertions(+), 30 deletions(-)
diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c
index 29f5fed28c2a78..974cb08c7aa715 100644
--- a/drivers/hwmon/fam15h_power.c
+++ b/drivers/hwmon/fam15h_power.c
@@ -221,7 +221,7 @@ static ssize_t power1_average_show(struct device *dev,
prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu];
}
- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period));
+ leftover = schedule_msec_hrtimeout_interruptible((data->power_period));
if (leftover)
return 0;
diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c
index 5bf2bfbc5379eb..6ce37819fb73f0 100644
--- a/drivers/iio/light/tsl2563.c
+++ b/drivers/iio/light/tsl2563.c
@@ -271,11 +271,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip)
default:
delay = 402;
}
- /*
- * TODO: Make sure that we wait at least required delay but why we
- * have to extend it one tick more?
- */
- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2);
+ schedule_msec_hrtimeout_interruptible(delay + 1);
}
static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc)
diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c
index 39530d43590e3c..a7caf2eb57712b 100644
--- a/drivers/media/i2c/msp3400-driver.c
+++ b/drivers/media/i2c/msp3400-driver.c
@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr)
break;
dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err,
dev, addr);
- schedule_timeout_interruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_interruptible((10));
}
if (err == 3) {
dev_warn(&client->dev, "resetting chip, sound will go off.\n");
@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val)
break;
dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err,
dev, addr);
- schedule_timeout_interruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_interruptible((10));
}
if (err == 3) {
dev_warn(&client->dev, "resetting chip, sound will go off.\n");
diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c
index 856e7ab7f33e80..766a262513373c 100644
--- a/drivers/media/pci/ivtv/ivtv-gpio.c
+++ b/drivers/media/pci/ivtv/ivtv-gpio.c
@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv)
curout = (curout & ~0xF) | 1;
write_reg(curout, IVTV_REG_GPIO_OUT);
/* We could use something else for smaller time */
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible((1));
curout |= 2;
write_reg(curout, IVTV_REG_GPIO_OUT);
curdir &= ~0x80;
@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value)
curout = read_reg(IVTV_REG_GPIO_OUT);
curout &= ~(1 << itv->card->xceive_pin);
write_reg(curout, IVTV_REG_GPIO_OUT);
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible((1));
curout |= 1 << itv->card->xceive_pin;
write_reg(curout, IVTV_REG_GPIO_OUT);
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible((1));
return 0;
}
diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c
index cb0437b4c33125..163fffc0e1d476 100644
--- a/drivers/media/radio/radio-mr800.c
+++ b/drivers/media/radio/radio-mr800.c
@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv,
retval = -ENODATA;
break;
}
- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) {
+ if (schedule_msec_hrtimeout_interruptible((10))) {
retval = -ERESTARTSYS;
break;
}
diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c
index fb9de7bbcd195e..e53cf45e7f3fe8 100644
--- a/drivers/media/radio/radio-tea5777.c
+++ b/drivers/media/radio/radio-tea5777.c
@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait)
}
if (wait) {
- if (schedule_timeout_interruptible(msecs_to_jiffies(wait)))
+ if (schedule_msec_hrtimeout_interruptible((wait)))
return -ERESTARTSYS;
}
diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c
index c37315226c427a..e73e6393403c84 100644
--- a/drivers/media/radio/tea575x.c
+++ b/drivers/media/radio/tea575x.c
@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea,
for (;;) {
if (time_after(jiffies, timeout))
break;
- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) {
+ if (schedule_msec_hrtimeout_interruptible((10))) {
/* some signal arrived, stop search */
tea->val &= ~TEA575X_BIT_SEARCH;
snd_tea575x_set_freq(tea);
diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c
index 4547ac44c8d48d..8fa1a7fdf12ccb 100644
--- a/drivers/parport/ieee1284.c
+++ b/drivers/parport/ieee1284.c
@@ -202,7 +202,7 @@ int parport_wait_peripheral(struct parport *port,
/* parport_wait_event didn't time out, but the
* peripheral wasn't actually ready either.
* Wait for another 10ms. */
- schedule_timeout_interruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_interruptible((10));
}
}
diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c
index 2c11bd3fe1fd68..8cb6b61c0880de 100644
--- a/drivers/parport/ieee1284_ops.c
+++ b/drivers/parport/ieee1284_ops.c
@@ -520,7 +520,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port,
/* Yield the port for a while. */
if (count && dev->port->irq != PARPORT_IRQ_NONE) {
parport_release (dev);
- schedule_timeout_interruptible(msecs_to_jiffies(40));
+ schedule_msec_hrtimeout_interruptible((40));
parport_claim_or_block (dev);
}
else
diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c
index bffe548187eed3..c2918ee3e100a5 100644
--- a/drivers/platform/x86/intel_ips.c
+++ b/drivers/platform/x86/intel_ips.c
@@ -798,7 +798,7 @@ static int ips_adjust(void *data)
ips_gpu_lower(ips);
sleep:
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD));
+ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD));
} while (!kthread_should_stop());
dev_dbg(ips->dev, "ips-adjust thread stopped\n");
@@ -974,7 +974,7 @@ static int ips_monitor(void *data)
seqno_timestamp = get_jiffies_64();
old_cpu_power = thm_readl(THM_CEC);
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD));
+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD));
/* Collect an initial average */
for (i = 0; i < IPS_SAMPLE_COUNT; i++) {
@@ -1001,7 +1001,7 @@ static int ips_monitor(void *data)
mchp_samples[i] = mchp;
}
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD));
+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD));
if (kthread_should_stop())
break;
}
@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data)
* us to reduce the sample frequency if the CPU and GPU are idle.
*/
old_cpu_power = thm_readl(THM_CEC);
- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD));
+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD));
last_sample_period = IPS_SAMPLE_PERIOD;
timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 3fba429f1f57b0..9a3a9a6eb8377a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)
mutex_unlock(&pktgen_thread_lock);
pr_debug("%s: waiting for %s to disappear....\n",
__func__, ifname);
- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
+ schedule_msec_hrtimeout_interruptible((msec_per_try));
mutex_lock(&pktgen_thread_lock);
if (++i >= max_tries) {
diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c
index 15d42ce3b21d67..897fced9589bc6 100644
--- a/sound/soc/codecs/wm8350.c
+++ b/sound/soc/codecs/wm8350.c
@@ -234,10 +234,10 @@ static void wm8350_pga_work(struct work_struct *work)
out2->ramp == WM8350_RAMP_UP) {
/* delay is longer over 0dB as increases are larger */
if (i >= WM8350_OUTn_0dB)
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(2));
else
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(1));
} else
udelay(50); /* doesn't matter if we delay longer */
@@ -1121,7 +1121,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component,
(platform->dis_out4 << 6));
/* wait for discharge */
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(platform->
cap_discharge_msecs));
@@ -1137,7 +1137,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component,
WM8350_VBUFEN);
/* wait for vmid */
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(platform->
vmid_charge_msecs));
@@ -1188,7 +1188,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component,
wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1);
/* wait */
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(platform->
vmid_discharge_msecs));
@@ -1206,7 +1206,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component,
pm1 | WM8350_OUTPUT_DRAIN_EN);
/* wait */
- schedule_timeout_interruptible(msecs_to_jiffies
+ schedule_msec_hrtimeout_interruptible(
(platform->drain_msecs));
pm1 &= ~WM8350_BIASEN;
diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c
index a9a6d766a17631..45bf31de6282a9 100644
--- a/sound/soc/codecs/wm8900.c
+++ b/sound/soc/codecs/wm8900.c
@@ -1104,7 +1104,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component,
/* Need to let things settle before stopping the clock
* to ensure that restart works, see "Stopping the
* master clock" in the datasheet. */
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible(1);
snd_soc_component_write(component, WM8900_REG_POWER2,
WM8900_REG_POWER2_SYSCLK_ENA);
break;
diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c
index e0ce32dd4a8115..eb91c0282aad4f 100644
--- a/sound/soc/codecs/wm9713.c
+++ b/sound/soc/codecs/wm9713.c
@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w,
/* Gracefully shut down the voice interface. */
snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200);
- schedule_timeout_interruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_interruptible(1);
snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00);
snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000);
@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component,
wm9713->pll_in = freq_in;
/* wait 10ms AC97 link frames for the link to stabilise */
- schedule_timeout_interruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_interruptible((10));
return 0;
}
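As in the previous patch, the argument is now plain milliseconds rather than jiffies, and the return value keeps its meaning of "time remaining because a signal arrived early"; a minimal sketch:

    /* Old: */
    if (schedule_timeout_interruptible(msecs_to_jiffies(10)))
    	return -ERESTARTSYS;	/* woken early by a signal */

    /* New: same semantics, ~10ms regardless of HZ: */
    if (schedule_msec_hrtimeout_interruptible(10))
    	return -ERESTARTSYS;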
From ab42a98e73e18e1393a38ae0920bfb38ada869a8 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Mon, 20 Feb 2017 13:30:32 +1100
Subject: [PATCH 07/34] Convert all schedule_timeout_uninterruptible calls
 with timeouts potentially under 50ms to schedule_msec_hrtimeout_uninterruptible
---
drivers/media/pci/cx18/cx18-gpio.c | 4 ++--
drivers/net/wireless/intel/ipw2x00/ipw2100.c | 4 ++--
drivers/rtc/rtc-wm8350.c | 6 +++---
drivers/scsi/lpfc/lpfc_scsi.c | 2 +-
sound/pci/maestro3.c | 4 ++--
sound/soc/codecs/rt5631.c | 4 ++--
sound/soc/soc-dapm.c | 2 +-
7 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c
index cf7cfda9410716..f63e17489547ce 100644
--- a/drivers/media/pci/cx18/cx18-gpio.c
+++ b/drivers/media/pci/cx18/cx18-gpio.c
@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi,
/* Assert */
gpio_update(cx, mask, ~active_lo);
- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs));
+ schedule_msec_hrtimeout_uninterruptible((assert_msecs));
/* Deassert */
gpio_update(cx, mask, ~active_hi);
- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs));
+ schedule_msec_hrtimeout_uninterruptible((recovery_msecs));
}
/*
diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
index 23fbddd0c1f8e0..534ab3b894e2db 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
@@ -815,7 +815,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv,
* doesn't seem to have as many firmware restart cycles...
*
* As a test, we're sticking in a 1/100s delay here */
- schedule_timeout_uninterruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_uninterruptible((10));
return 0;
@@ -1266,7 +1266,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv)
IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n");
i = 5000;
do {
- schedule_timeout_uninterruptible(msecs_to_jiffies(40));
+ schedule_msec_hrtimeout_uninterruptible((40));
/* Todo... wait for sync command ... */
read_register(priv->net_dev, IPW_REG_INTA, &inta);
diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c
index 2018614f258f6f..fc19b312c345fb 100644
--- a/drivers/rtc/rtc-wm8350.c
+++ b/drivers/rtc/rtc-wm8350.c
@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm)
/* Wait until confirmation of stopping */
do {
rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
- schedule_timeout_uninterruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_uninterruptible((1));
} while (--retries && !(rtc_ctrl & WM8350_RTC_STS));
if (!retries) {
@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350)
/* Wait until confirmation of stopping */
do {
rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
- schedule_timeout_uninterruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_uninterruptible((1));
} while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS));
if (!(rtc_ctrl & WM8350_RTC_ALMSTS))
@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350)
/* Wait until confirmation */
do {
rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
- schedule_timeout_uninterruptible(msecs_to_jiffies(1));
+ schedule_msec_hrtimeout_uninterruptible((1));
} while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS);
if (rtc_ctrl & WM8350_RTC_ALMSTS)
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index eefbb9b2279848..ab7d7869dd1219 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -5829,7 +5829,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id,
tgt_id, lun_id, context);
later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies;
while (time_after(later, jiffies) && cnt) {
- schedule_timeout_uninterruptible(msecs_to_jiffies(20));
+ schedule_msec_hrtimeout_uninterruptible((20));
cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context);
}
if (cnt) {
diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c
index cdc4b610625266..159c40ec680db5 100644
--- a/sound/pci/maestro3.c
+++ b/sound/pci/maestro3.c
@@ -1990,7 +1990,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip)
outw(0, io + GPIO_DATA);
outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION);
- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1));
+ schedule_msec_hrtimeout_uninterruptible((delay1));
outw(GPO_PRIMARY_AC97, io + GPIO_DATA);
udelay(5);
@@ -1998,7 +1998,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip)
outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A);
outw(~0, io + GPIO_MASK);
- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2));
+ schedule_msec_hrtimeout_uninterruptible((delay2));
if (! snd_m3_try_read_vendor(chip))
break;
diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c
index 3000bc128b5bcb..d9b64d61bd865e 100644
--- a/sound/soc/codecs/rt5631.c
+++ b/sound/soc/codecs/rt5631.c
@@ -417,7 +417,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena
hp_zc = snd_soc_component_read(component, RT5631_INT_ST_IRQ_CTRL_2);
snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff);
if (enable) {
- schedule_timeout_uninterruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_uninterruptible((10));
/* config one-bit depop parameter */
rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f);
snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL,
@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable
hp_zc = snd_soc_component_read(component, RT5631_INT_ST_IRQ_CTRL_2);
snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff);
if (enable) {
- schedule_timeout_uninterruptible(msecs_to_jiffies(10));
+ schedule_msec_hrtimeout_uninterruptible((10));
/* config depop sequence parameter */
rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f);
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 91bf939d5233ea..1bf7118d96a4c9 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm)
static void pop_wait(u32 pop_time)
{
if (pop_time)
- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time));
+ schedule_msec_hrtimeout_uninterruptible((pop_time));
}
__printf(3, 4)
From 034835119f000f95d9c25167d49acc62f8436249 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Mon, 20 Feb 2017 13:32:58 +1100
Subject: [PATCH 08/34] Don't use hrtimer overlay when pm_freezing since some
drivers still don't correctly use freezable timeouts.
---
kernel/time/hrtimer.c | 2 +-
kernel/time/timer.c | 9 +++++----
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 46757e9c60e115..dd8dd73da8e0eb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2258,7 +2258,7 @@ long __sched schedule_msec_hrtimeout(long timeout)
* (yet) better than Hz, as would occur during startup, use regular
* timers.
*/
- if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ)
+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing)
return schedule_timeout(jiffs);
secs = timeout / 1000;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d4a9e45e0e2346..f6c069896336d3 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -44,6 +44,7 @@
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
+#include <linux/freezer.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -2062,12 +2063,12 @@ void msleep(unsigned int msecs)
* Use high resolution timers where the resolution of tick based
* timers is inadequate.
*/
- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) {
+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
while (msecs)
msecs = schedule_msec_hrtimeout_uninterruptible(msecs);
return;
}
- timeout = msecs_to_jiffies(msecs) + 1;
+ timeout = jiffs + 1;
while (timeout)
timeout = schedule_timeout_uninterruptible(timeout);
@@ -2084,12 +2085,12 @@ unsigned long msleep_interruptible(unsigned int msecs)
int jiffs = msecs_to_jiffies(msecs);
unsigned long timeout;
- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) {
+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
while (msecs && !signal_pending(current))
msecs = schedule_msec_hrtimeout_interruptible(msecs);
return msecs;
}
- timeout = msecs_to_jiffies(msecs) + 1;
+ timeout = jiffs + 1;
while (timeout && !signal_pending(current))
timeout = schedule_timeout_interruptible(timeout);
From 3eac16a1bd6be4a1ab20e8b6476da844f0c75ae5 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Tue, 22 Nov 2016 10:15:51 +1100
Subject: [PATCH 09/34] Make hrtimer granularity and minimum hrtimeout
configurable in sysctl. Set default granularity to 100us and min timeout to
500us.
---
kernel/sysctl.c | 23 ++++++++++++++++--
kernel/time/clockevents.c | 8 ++-----
kernel/time/hrtimer.c | 49 +++++++++++++++++++++++++++++++++++----
3 files changed, 68 insertions(+), 12 deletions(-)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2020833509675..e3908310e5cc45 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -128,7 +128,9 @@ extern int sched_interactive;
extern int sched_iso_cpu;
extern int sched_yield_type;
#endif
-#ifdef CONFIG_PRINTK
+extern int hrtimer_granularity_us;
+extern int hrtimeout_min_us;
+#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS)
static int ten_thousand = 10000;
#endif
#ifdef CONFIG_PERF_EVENTS
@@ -1903,7 +1905,24 @@ static struct ctl_table kern_table[] = {
},
#endif /* CONFIG_SMP && CONFIG_SCHEDSTATS */
#endif /* CONFIG_SCHED_MUQSS */
-
+ {
+ .procname = "hrtimer_granularity_us",
+ .data = &hrtimer_granularity_us,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &ten_thousand,
+ },
+ {
+ .procname = "hrtimeout_min_us",
+ .data = &hrtimeout_min_us,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &ten_thousand,
+ },
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
{
.procname = "sched_energy_aware",
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 7a61971cca7409..544c58c29267f3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -190,13 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev)
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
-#ifdef CONFIG_SCHED_MUQSS
+int __read_mostly hrtimer_granularity_us = 100;
/* Limit min_delta to 100us */
-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000)
-#else
-/* Limit min_delta to a jiffie */
-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
-#endif
+#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC)
/**
* clockevents_increase_min_delta - raise minimum delta of a clock event device
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index dd8dd73da8e0eb..95d4f24b3afca3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2244,7 +2244,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout);
long __sched schedule_msec_hrtimeout(long timeout)
{
struct hrtimer_sleeper t;
- int delta, secs, jiffs;
+ int delta, jiffs;
ktime_t expires;
if (!timeout) {
@@ -2261,9 +2261,8 @@ long __sched schedule_msec_hrtimeout(long timeout)
if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing)
return schedule_timeout(jiffs);
- secs = timeout / 1000;
delta = (timeout % 1000) * NSEC_PER_MSEC;
- expires = ktime_set(secs, delta);
+ expires = ktime_set(0, delta);
hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer_set_expires_range_ns(&t.timer, expires, delta);
@@ -2285,9 +2284,51 @@ long __sched schedule_msec_hrtimeout(long timeout)
EXPORT_SYMBOL(schedule_msec_hrtimeout);
+#define USECS_PER_SEC 1000000
+extern int hrtimer_granularity_us;
+
+static inline long schedule_usec_hrtimeout(long timeout)
+{
+ struct hrtimer_sleeper t;
+ ktime_t expires;
+ int delta;
+
+ if (!timeout) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ if (hrtimer_resolution >= NSEC_PER_SEC / HZ)
+ return schedule_timeout(usecs_to_jiffies(timeout));
+
+ if (timeout < hrtimer_granularity_us)
+ timeout = hrtimer_granularity_us;
+ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC;
+ expires = ktime_set(0, delta);
+
+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_set_expires_range_ns(&t.timer, expires, delta);
+
+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL);
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ expires = hrtimer_expires_remaining(&t.timer);
+ timeout = ktime_to_us(expires);
+ return timeout < 0 ? 0 : timeout;
+}
+
+int __read_mostly hrtimeout_min_us = 500;
+
long __sched schedule_min_hrtimeout(void)
{
- return schedule_msec_hrtimeout(1);
+ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us));
}
EXPORT_SYMBOL(schedule_min_hrtimeout);
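With this applied, both knobs land in /proc/sys/kernel/ alongside the other MuQSS tunables, so the 100us granularity and 500us minimum-timeout defaults can be changed at runtime, e.g. echo 250 > /proc/sys/kernel/hrtimer_granularity_us or echo 1000 > /proc/sys/kernel/hrtimeout_min_us; both values are clamped to the 1..10000 range by the proc_dointvec_minmax handler registered above.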
From 86f1a40495e4d396c8e63437450dec041f57d0c9 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
Date: Wed, 5 Sep 2018 12:30:38 +0200
Subject: [PATCH 12/34] muqss: Restore fixes broken by hrtimeout patches
---
drivers/accessibility/speakup/synth.c | 6 +++++-
drivers/iio/light/tsl2563.c | 6 +++++-
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/accessibility/speakup/synth.c b/drivers/accessibility/speakup/synth.c
index bf0cbdaf564f80..1dd3710771ba5d 100644
--- a/drivers/accessibility/speakup/synth.c
+++ b/drivers/accessibility/speakup/synth.c
@@ -93,7 +93,11 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode)
spin_unlock_irqrestore(&speakup_info.spinlock, flags);
if (ch == '\n')
ch = synth->procspeech;
- if (!synth->io_ops->synth_out(synth, ch)) {
+ if (unicode)
+ ret = synth->io_ops->synth_out_unicode(synth, ch);
+ else
+ ret = synth->io_ops->synth_out(synth, ch);
+ if (!ret) {
schedule_msec_hrtimeout(full_time_val);
continue;
}
diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c
index 6ce37819fb73f0..921f86fcd4edf5 100644
--- a/drivers/iio/light/tsl2563.c
+++ b/drivers/iio/light/tsl2563.c
@@ -271,7 +271,11 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip)
default:
delay = 402;
}
- schedule_msec_hrtimeout_interruptible(delay + 1);
+ /*
+ * TODO: Make sure that we wait at least required delay but why we
+ * have to extend it one tick more?
+ */
+ schedule_msec_hrtimeout_interruptible(delay + 2);
}
static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc)
From 958ee0ec7df4e443989889933ba4b463000b5ba5 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
Date: Wed, 5 Sep 2018 12:42:44 +0200
Subject: [PATCH 13/34] muqss: Make hrtimeout patches depend on MuQSS
---
include/linux/sched.h | 5 +++--
kernel/sysctl.c | 6 +++---
kernel/time/clockevents.c | 5 +++++
kernel/time/hrtimer.c | 2 ++
kernel/time/timer.c | 23 +++++++++++++----------
5 files changed, 26 insertions(+), 15 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11a5a6054ac7ac..35654025752d29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -220,14 +220,15 @@ struct task_group;
extern void scheduler_tick(void);
-#define MAX_SCHEDULE_TIMEOUT LONG_MAX
+#define MAX_SCHEDULE_TIMEOUT LONG_MAX
+
extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
-#ifdef CONFIG_HIGH_RES_TIMERS
+#if defined(CONFIG_HIGH_RES_TIMERS) && defined(CONFIG_SCHED_MUQSS)
extern long schedule_msec_hrtimeout(long timeout);
extern long schedule_min_hrtimeout(void);
extern long schedule_msec_hrtimeout_interruptible(long timeout);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e3908310e5cc45..e5e40007aa3561 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -120,16 +120,16 @@ static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int two_hundred = 200;
static int one_thousand = 1000;
+#ifdef CONFIG_SCHED_MUQSS
static int zero = 0;
static int one = 1;
-#ifdef CONFIG_SCHED_MUQSS
extern int rr_interval;
extern int sched_interactive;
extern int sched_iso_cpu;
extern int sched_yield_type;
-#endif
extern int hrtimer_granularity_us;
extern int hrtimeout_min_us;
+#endif
#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS)
static int ten_thousand = 10000;
#endif
@@ -1904,7 +1904,6 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SMP && CONFIG_SCHEDSTATS */
-#endif /* CONFIG_SCHED_MUQSS */
{
.procname = "hrtimer_granularity_us",
.data = &hrtimer_granularity_us,
@@ -1923,6 +1922,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
.extra2 = &ten_thousand,
},
+#endif /* CONFIG_SCHED_MUQSS */
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
{
.procname = "sched_energy_aware",
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 544c58c29267f3..3b3bf431f14e69 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -190,9 +190,14 @@ int clockevents_tick_resume(struct clock_event_device *dev)
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
+#ifdef CONFIG_SCHED_MUQSS
int __read_mostly hrtimer_granularity_us = 100;
/* Limit min_delta to 100us */
#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC)
+#else
+/* Limit min_delta to a jiffie */
+#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
+#endif
/**
* clockevents_increase_min_delta - raise minimum delta of a clock event device
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 95d4f24b3afca3..2045eb35e3dd11 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2237,6 +2237,7 @@ int __sched schedule_hrtimeout(ktime_t *expires,
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);
+#ifdef CONFIG_SCHED_MUQSS
/*
* As per schedule_hrtimeout but taskes a millisecond value and returns how
* many milliseconds are left.
@@ -2346,3 +2347,4 @@ long __sched schedule_msec_hrtimeout_uninterruptible(long timeout)
return schedule_msec_hrtimeout(timeout);
}
EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible);
+#endif /* CONFIG_SCHED_MUQSS */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1e85fa6dfb2f37..a9fef079a5cb07 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1894,7 +1894,7 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
-#ifdef CONFIG_HIGH_RES_TIMERS
+#if defined(CONFIG_HIGH_RES_TIMERS) && defined(CONFIG_SCHED_MUQSS)
if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) {
/*
* Special case 1 as being a request for the minimum timeout
@@ -1914,10 +1914,13 @@ signed long __sched schedule_timeout(signed long timeout)
/* Remove the timer from the object tracker */
destroy_timer_on_stack(&timer.timer);
+
+#if defined(CONFIG_HIGH_RES_TIMERS) && defined(CONFIG_SCHED_MUQSS)
out_timeout:
+#endif
timeout = expire - jiffies;
-out:
+ out:
return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
@@ -2060,19 +2063,19 @@ void __init init_timers(void)
*/
void msleep(unsigned int msecs)
{
- int jiffs = msecs_to_jiffies(msecs);
- unsigned long timeout;
+ unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+#ifdef CONFIG_SCHED_MUQSS
/*
* Use high resolution timers where the resolution of tick based
* timers is inadequate.
*/
- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
+ if (timeout < 6 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
while (msecs)
msecs = schedule_msec_hrtimeout_uninterruptible(msecs);
return;
}
- timeout = jiffs + 1;
+#endif
while (timeout)
timeout = schedule_timeout_uninterruptible(timeout);
@@ -2086,15 +2089,15 @@ EXPORT_SYMBOL(msleep);
*/
unsigned long msleep_interruptible(unsigned int msecs)
{
- int jiffs = msecs_to_jiffies(msecs);
- unsigned long timeout;
+ unsigned long timeout = msecs_to_jiffies(msecs) + 1;
- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
+#ifdef CONFIG_SCHED_MUQSS
+ if (timeout < 6 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) {
while (msecs && !signal_pending(current))
msecs = schedule_msec_hrtimeout_interruptible(msecs);
return msecs;
}
- timeout = jiffs + 1;
+#endif
while (timeout && !signal_pending(current))
timeout = schedule_timeout_interruptible(timeout);
From 3aae9e40df5b31eddc7a5a55f1d7a35e27fea1d3 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Thu, 29 Apr 2021 14:39:35 -0500
Subject: [PATCH 14/34] muqss: Fix double schedule timeout block/swim
Source:
https://ck-hack.blogspot.com/2021/04/linux-512-ck1-muqss-version-0210-for.html?showComment=1619687912361#c5130193842478724245
---
drivers/block/swim.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index fcad8d963cd893..e8ab61f8df3ddb 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -371,7 +371,6 @@ static inline int swim_step(struct swim __iomem *base)
for (wait = 0; wait < HZ; wait++) {
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
schedule_min_hrtimeout();
swim_select(base, RELAX);
From c60d85512e10f3fe7a503974d30988d631408324 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 12 Sep 2020 17:32:14 -0500
Subject: [PATCH 16/34] ZEN: INTERACTIVE: Set MuQSS' rr_interval to 4ms
With some quick benchmarks on Shadow Of The Tomb Raider, 4ms got me the
best 99th percentile scores on MuQSS on an overclocked 5960x.
Previous commit messages:
ZEN: INTERACTIVE: Set MuQSS' rr_interval to 2ms
With experiments on yield_type, I found that setting yield_type to 0
(making sched_yield a no-op) makes emulators like RPCS3 run far faster.
However, this actually does seem to affect the general system. Booting
fresh into KDE results in mouse stutter and a loss of general
responsiveness when many applications are starting.
Let's fix this by reducing the rr_interval to 2ms. This will lead into
our next change, where we set yield_type to 2, which always recalculates
the next deadline at which the process must run again.
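As the MuQSS comment in the hunk below notes, rr_interval is tunable via
the /proc interface, so the compiled-in value is only a default. A
minimal sketch of inspecting and lowering it at runtime, assuming the
-ck sysctl path /proc/sys/kernel/rr_interval (writing requires root):

    #include <stdio.h>

    int main(void)
    {
            /* Assumed path: MuQSS exposes rr_interval as a sysctl. */
            FILE *f = fopen("/proc/sys/kernel/rr_interval", "r+");
            int ms;

            if (!f)
                    return 1;
            if (fscanf(f, "%d", &ms) == 1)
                    printf("rr_interval is currently %dms\n", ms);
            rewind(f);
            fprintf(f, "4\n"); /* try the ZEN_INTERACTIVE default */
            fclose(f);
            return 0;
    }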
---
kernel/sched/MuQSS.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index 1fdb664895dce4..2c01d0345a4826 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -131,7 +131,11 @@ void print_scheduler_version(void)
* Value is in ms and set to a minimum of 6ms.
* Tunable via /proc interface.
*/
+#ifdef CONFIG_ZEN_INTERACTIVE
+int rr_interval __read_mostly = 4;
+#else
int rr_interval __read_mostly = 6;
+#endif
/*
* Tunable to choose whether to prioritise latency or throughput, simple
From 56f24ae800a796c1b04b2a9e5d6c3aa717a5ef61 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 4 Jan 2020 19:14:45 -0600
Subject: [PATCH 17/34] ZEN: INTERACTIVE: Set MuQSS' yield_type to 0 (don't
yield)
Turns out when I investigated performance issues with RPCS3 on Linux
with MuQSS, my choice to set yield_type to 2 was flawed, since I didn't
benchmark RPCS3 or any other applications that cared about it.
Phoronix wrote an article measuring the performance of Liquorix against a
stock 5.4 configuration here: https://www.phoronix.com/vr.php?view=28750
All the benchmarks measured framerate, and Liquorix for the most part got
up to 20% less FPS than stock CFS, depending on the game. It turns out
some of it had to do with yield_type: always yielding when requested
dropped minimum frame times quite a bit, while disabling yield entirely
raised the average frame rate a bit and improved minimum frametimes on
Deus Ex: Mankind Divided by nearly 10%.
Also, Linus Torvalds wrote in a forum about sched_yield. He indicated
that yield used to make sense on uniprocessor configurations, but now
with multi-core being the norm, yield almost always causes performance
issues due to cache thrashing and thread/process migration on multicore
systems: https://www.realworldtech.com/forum/?threadid=189711&curpostid=189752
And finally, even if we don't yield, MuQSS will reschedule the thread
that's spinning anyway. All setting yield_type to 2 did was reschedule
the thread sooner. Let's let MuQSS decide when a thread needs to be
rescheduled, not the program.
Previous commit messages:
muqss, zen: Set yield_type to 2
In my previous attempt at making games and emulators behave better, I
found that making sched_yield a no-op by setting yield_type to 0 caused
emulators like RPCS3 to perform far better without frame drops.
Unfortunately, on my next boot I noticed the whole system felt less
responsive in general.
Fortunately, _always_ expiring the timeslice of the yielding process and
calculating the next deadline also fixes this weird performance anomaly.
Setting yield_type to 2 with a reduced rr_interval (2ms) should help
offset the disadvantages of rescheduling a process that has yielded.
muqss, zen: Disable sched_yield by default
While experimenting with audio stutter and frame drops through the RPCS3
emulator, I found that yet again, sched_yield is the source of
unpredictable performance drops. In the original post[1] by Con, where
he outlined his last (final?) change to sched_yield, he describes
changing the behavior of sched_yield in MuQSS to only yield to better
priority or deadline tasks.
It seems, then, that on my system emulators and games are yielding to
some number of higher priority / next deadline tasks and performing much
worse. If you look at the comments on the blog post, however, many
people found that setting yield_type to 0 improved performance in their
games.
Let's just turn sched_yield into a noop for now and see how that fares.
Worst case we'll receive a report about an application behaving badly,
and best case everyone will get a net benefit in gaming and emulators.
[1] http://ck-hack.blogspot.com/2016/12/linux-49-ck1-muqss-version-0150.html
Revert "muqss, zen: Disable sched_yield by default"
This reverts commit 8ec985bcd0ded1bcdd8bb999c90c094dc9536a0b.
This change, as expected, has strange side-effects with system
responsiveness when many applications are launching at boot. Also at
random times, the mouse becomes unresponsive while oversaturating CPU
bandwidth.
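For context, the workloads discussed above hit sched_yield() from
spin-wait loops. A toy sketch of that pattern (illustrative only, not
taken from RPCS3 or any real emulator):

    #include <sched.h>
    #include <stdatomic.h>

    /* Spin until another thread signals completion. Under MuQSS,
     * yield_type decides what the sched_yield() call does here:
     *   0 -- no-op: the spinner keeps running until its slice ends
     *   1 -- yield only to better priority/deadline tasks
     *   2 -- expire the slice and recalculate the deadline
     */
    void spin_until_done(atomic_int *done)
    {
            while (!atomic_load_explicit(done, memory_order_acquire))
                    sched_yield();
    }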
---
kernel/sched/MuQSS.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index 2c01d0345a4826..949ee9fe7c11b1 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -160,7 +160,12 @@ int sched_iso_cpu __read_mostly = 70;
* 1: Yield only to better priority/deadline tasks. (default)
* 2: Expire timeslice and recalculate deadline.
*/
+
+#ifdef CONFIG_ZEN_INTERACTIVE
+int sched_yield_type __read_mostly = 0;
+#else
int sched_yield_type __read_mostly = 1;
+#endif
/*
* The relative length of deadline for each priority(nice) level.
From 4870f6578527234b920e5849f107156f6dcd8ef5 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 11:59:44 -0500
Subject: [PATCH 18/34] muqss: Backport "cpufreq: CPPC: Add support for
frequency invariance"
---
kernel/sched/MuQSS.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index 949ee9fe7c11b1..9e93a84eef5d23 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -5534,6 +5534,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
From 7a04658ede1ed70d61042a3e63f67ed070bc45e5 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:02:21 -0500
Subject: [PATCH 19/34] muqss: Backport "static_call: Relax
static_call_update() function argument type"
---
kernel/sched/MuQSS.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index 9e93a84eef5d23..d62263130dbe34 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -4691,25 +4691,25 @@ static void sched_dynamic_update(int mode)
switch (mode) {
case preempt_dynamic_none:
static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
- static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
- static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
- static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ static_call_update(might_resched, (void *)&__static_call_return0);
+ static_call_update(preempt_schedule, NULL);
+ static_call_update(preempt_schedule_notrace, NULL);
+ static_call_update(irqentry_exit_cond_resched, NULL);
pr_info("Dynamic Preempt: none\n");
break;
case preempt_dynamic_voluntary:
static_call_update(cond_resched, __cond_resched);
static_call_update(might_resched, __cond_resched);
- static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
- static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
- static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ static_call_update(preempt_schedule, NULL);
+ static_call_update(preempt_schedule_notrace, NULL);
+ static_call_update(irqentry_exit_cond_resched, NULL);
pr_info("Dynamic Preempt: voluntary\n");
break;
case preempt_dynamic_full:
- static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0);
- static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(cond_resched, (void *)&__static_call_return0);
+ static_call_update(might_resched, (void *)&__static_call_return0);
static_call_update(preempt_schedule, __preempt_schedule_func);
static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
From f61916903d2db4501537bf3ff5a9e253c8cc70bb Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:07:47 -0500
Subject: [PATCH 20/34] muqss: Backport "psi: Use ONCPU state tracking
machinery to detect reclaim"
---
kernel/sched/MuQSS.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index d62263130dbe34..c0bef0f211b512 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -3769,7 +3769,6 @@ void scheduler_tick(void)
task_tick(rq);
rq->last_scheduler_tick = rq->last_jiffy;
rq->last_tick = rq->clock;
- psi_task_tick(rq);
perf_event_task_tick();
sched_stop_tick(rq, cpu);
}
From ec78719fb255d4d4c126fac63976b6abd42d5180 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:11:27 -0500
Subject: [PATCH 21/34] muqss: Backport "sched/core: Stop using magic values in
sched_dynamic_mode()"
---
kernel/sched/MuQSS.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index c0bef0f211b512..d91d8932e403df 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -4664,13 +4664,13 @@ static int preempt_dynamic_mode = preempt_dynamic_full;
static int sched_dynamic_mode(const char *str)
{
if (!strcmp(str, "none"))
- return 0;
+ return preempt_dynamic_none;
if (!strcmp(str, "voluntary"))
- return 1;
+ return preempt_dynamic_voluntary;
if (!strcmp(str, "full"))
- return 2;
+ return preempt_dynamic_full;
return -1;
}
From f98914173f2145289ffe767be44d500cdf07f8e2 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:12:07 -0500
Subject: [PATCH 22/34] muqss: Backport "sched/core: Use -EINVAL in
sched_dynamic_mode()"
---
kernel/sched/MuQSS.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index d91d8932e403df..cd414e569e7f9a 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -4672,7 +4672,7 @@ static int sched_dynamic_mode(const char *str)
if (!strcmp(str, "full"))
return preempt_dynamic_full;
- return -1;
+ return -EINVAL;
}
static void sched_dynamic_update(int mode)
From 63ae1c65c7625201d641b9a8227c072c671be7b6 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:16:35 -0500
Subject: [PATCH 23/34] muqss: Backport "sched: Use cpu_dying() to fix
balance_push vs hotplug-rollback"
---
kernel/sched/MuQSS.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index cd414e569e7f9a..fbd1c75620220b 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -1620,6 +1620,10 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
if (kthread_is_per_cpu(p))
return cpu_online(cpu);
+ /* Regular kernel threads don't get to stay during offline. */
+ if (cpu_dying(cpu))
+ return false;
+
/* But are allowed during online. */
return cpu_online(cpu);
}
From 485abe581fd8ca16d1e630f1f7942e946a50524a Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:18:57 -0500
Subject: [PATCH 24/34] muqss: Backport "sched: Move SCHED_DEBUG sysctl to
debugfs"
---
kernel/sched/MuQSS.c | 4 +++-
kernel/sched/MuQSS.h | 2 ++
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index fbd1c75620220b..8eaa56a70c63ee 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -4796,9 +4796,11 @@ static const struct file_operations sched_dynamic_fops = {
.release = single_release,
};
+extern struct dentry *debugfs_sched;
+
static __init int sched_init_debug_dynamic(void)
{
- debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+ debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
return 0;
}
late_initcall(sched_init_debug_dynamic);
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
index e8b42add3a7fe0..2826a7b46d801b 100644
--- a/kernel/sched/MuQSS.h
+++ b/kernel/sched/MuQSS.h
@@ -701,6 +701,8 @@ static inline void unregister_sched_domain_sysctl(void)
}
#endif
+extern int sched_update_scaling(void);
+
extern void flush_smp_call_function_from_idle(void);
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
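With this change the knob moves out of the debugfs root into mainline's
scheduler directory: assuming debugfs is mounted at /sys/kernel/debug
and the debugfs_sched directory keeps its mainline name "sched", the
file is read as /sys/kernel/debug/sched/sched_preempt, and the show
handler prints every mode with the active one in parentheses, e.g.
"none voluntary (full)".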
From 5e7ae5fe5ed27af264c163d8cee0f0f0d4afb546 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:23:12 -0500
Subject: [PATCH 25/34] muqss: Backport "sched,preempt: Move preempt_dynamic to
debug.c"
---
kernel/sched/MuQSS.c | 76 ++------------------------------------------
kernel/sched/MuQSS.h | 10 ++++--
2 files changed, 11 insertions(+), 75 deletions(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index 8eaa56a70c63ee..80ce0b730bf099 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -4663,9 +4663,9 @@ enum {
preempt_dynamic_full,
};
-static int preempt_dynamic_mode = preempt_dynamic_full;
+int preempt_dynamic_mode = preempt_dynamic_full;
-static int sched_dynamic_mode(const char *str)
+int sched_dynamic_mode(const char *str)
{
if (!strcmp(str, "none"))
return preempt_dynamic_none;
@@ -4679,7 +4679,7 @@ static int sched_dynamic_mode(const char *str)
return -EINVAL;
}
-static void sched_dynamic_update(int mode)
+void sched_dynamic_update(int mode)
{
/*
* Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
@@ -4736,76 +4736,6 @@ static int __init setup_preempt_mode(char *str)
}
__setup("preempt=", setup_preempt_mode);
-#ifdef CONFIG_SCHED_DEBUG
-
-static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[16];
- int mode;
-
- if (cnt > 15)
- cnt = 15;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- mode = sched_dynamic_mode(strstrip(buf));
- if (mode < 0)
- return mode;
-
- sched_dynamic_update(mode);
-
- *ppos += cnt;
-
- return cnt;
-}
-
-static int sched_dynamic_show(struct seq_file *m, void *v)
-{
- static const char * preempt_modes[] = {
- "none", "voluntary", "full"
- };
- int i;
-
- for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
- if (preempt_dynamic_mode == i)
- seq_puts(m, "(");
- seq_puts(m, preempt_modes[i]);
- if (preempt_dynamic_mode == i)
- seq_puts(m, ")");
-
- seq_puts(m, " ");
- }
-
- seq_puts(m, "\n");
- return 0;
-}
-
-static int sched_dynamic_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, sched_dynamic_show, NULL);
-}
-
-static const struct file_operations sched_dynamic_fops = {
- .open = sched_dynamic_open,
- .write = sched_dynamic_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-extern struct dentry *debugfs_sched;
-
-static __init int sched_init_debug_dynamic(void)
-{
- debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
- return 0;
-}
-late_initcall(sched_init_debug_dynamic);
-
-#endif /* CONFIG_SCHED_DEBUG */
#endif /* CONFIG_PREEMPT_DYNAMIC */
/*
* This is the entry point to schedule() from kernel preemption
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
index 2826a7b46d801b..53142866c3481c 100644
--- a/kernel/sched/MuQSS.h
+++ b/kernel/sched/MuQSS.h
@@ -1056,8 +1056,14 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
}
#endif
-void swake_up_all_locked(struct swait_queue_head *q);
-void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void swake_up_all_locked(struct swait_queue_head *q);
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+extern int preempt_dynamic_mode;
+extern int sched_dynamic_mode(const char *str);
+extern void sched_dynamic_update(int mode);
+#endif
/* pelt.h compat CONFIG_SCHED_THERMAL_PRESSURE impossible with MUQSS */
static inline int
From 3de79719aa90d2a7c2406d70a582f0ca04478b14 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:27:26 -0500
Subject: [PATCH 26/34] muqss: Backport Revert "cpufreq: CPPC: Add support for
frequency invariance"
---
kernel/sched/MuQSS.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index 80ce0b730bf099..7940979f3e7804 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -5469,7 +5469,6 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
-EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
From 451a8886c2de194bbdcbb674ce27e456c0e1fbff Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:34:55 -0500
Subject: [PATCH 27/34] muqss: Backport "sched/fair: Trigger the update of
blocked load on newly idle cpu"
---
kernel/sched/idle.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2ed8f5ae4d8b2a..46e79fb4bc467e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -265,7 +265,9 @@ static void do_idle(void)
/*
* Check if we need to update blocked load
*/
+#ifndef CONFIG_SCHED_MUQSS
nohz_run_idle_balance(cpu);
+#endif
/*
* If the arch has a polling bit, we maintain an invariant:
From a545ec822c718d108c08e29a79c32f03221f743b Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:37:58 -0500
Subject: [PATCH 28/34] muqss: Backport "sched,debug: Convert sysctl
sched_domains to debugfs"
---
kernel/sched/MuQSS.h | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
index 53142866c3481c..ae12c0c7cd2b7b 100644
--- a/kernel/sched/MuQSS.h
+++ b/kernel/sched/MuQSS.h
@@ -685,20 +685,16 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
}
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-void register_sched_domain_sysctl(void);
+#ifdef CONFIG_SCHED_DEBUG
+void update_sched_domain_debugfs(void);
void dirty_sched_domain_sysctl(int cpu);
-void unregister_sched_domain_sysctl(void);
#else
-static inline void register_sched_domain_sysctl(void)
+static inline void update_sched_domain_debugfs(void)
{
}
static inline void dirty_sched_domain_sysctl(int cpu)
{
}
-static inline void unregister_sched_domain_sysctl(void)
-{
-}
#endif
extern int sched_update_scaling(void);
@@ -1081,4 +1077,6 @@ static inline u64 thermal_load_avg(struct rq *rq)
extern int sysctl_sched_rt_runtime;
#endif
+
+
#endif /* MUQSS_SCHED_H */
From 9bb579b580943867b1f24a29d6515ab1af07eb1d Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 12:54:19 -0500
Subject: [PATCH 29/34] muqss: Sync cpu_rq/cpu_curr from sched.h
---
kernel/sched/MuQSS.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
index ae12c0c7cd2b7b..62e01b82600239 100644
--- a/kernel/sched/MuQSS.h
+++ b/kernel/sched/MuQSS.h
@@ -382,9 +382,11 @@ extern struct rq *uprq;
#define cpu_curr(cpu) ((uprq)->curr)
#else /* CONFIG_SMP */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)
-#define raw_rq() raw_cpu_ptr(&runqueues)
#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() raw_cpu_ptr(&runqueues)
#endif /* CONFIG_SMP */
static inline int task_current(struct rq *rq, struct task_struct *p)
From d99f60c26a53d618a47fc43380392152434a880f Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 7 Aug 2021 13:16:13 -0500
Subject: [PATCH 30/34] muqss: Remove cpu_rq/cpu_curr definition in MuQSS.c
---
kernel/sched/MuQSS.c | 6 ------
1 file changed, 6 deletions(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index 7940979f3e7804..ac25e76db1a9c9 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -246,12 +246,6 @@ static int total_runqueues __read_mostly = 1;
static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
-struct rq *cpu_rq(int cpu)
-{
- return &per_cpu(runqueues, (cpu));
-}
-#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-
/*
* For asym packing, by default the lower numbered cpu has higher priority.
*/
From 059ddaf54fa540e4ecd64f53507c7348678759b5 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Wed, 11 Aug 2021 18:57:00 -0500
Subject: [PATCH 31/34] muqss: Add missing "sched_debug_verbose" symbol to
MuQSS.h
Fixes: https://github.com/zen-kernel/zen-kernel/issues/234
---
kernel/sched/MuQSS.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
index 62e01b82600239..63935d778c50cb 100644
--- a/kernel/sched/MuQSS.h
+++ b/kernel/sched/MuQSS.h
@@ -749,6 +749,7 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
#ifdef CONFIG_SCHED_DEBUG
extern bool sched_debug_enabled;
+extern bool sched_debug_verbose;
#endif
extern void schedule_idle(void);
From 0364b7e40e5d20b0895e8b0c5ea76c7d40641fb7 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Fri, 13 Aug 2021 17:13:45 -0500
Subject: [PATCH 32/34] muqss: Make SCHED_DEBUG incompatible with MuQSS
MuQSS doesn't build properly since 5.13 when SCHED_DEBUG is enabled.
Considering MuQSS doesn't expose anything when this feature is on, let's
just turn it off when SCHED_MUQSS is selected.
Fixes: #234
---
lib/Kconfig.debug | 1 +
1 file changed, 1 insertion(+)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 678c13967580ec..b6f145792d49fe 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1164,6 +1164,7 @@ menu "Scheduler Debugging"
config SCHED_DEBUG
bool "Collect scheduler debugging info"
depends on DEBUG_KERNEL && PROC_FS
+ depends on !SCHED_MUQSS
default y
help
If you say Y here, the /proc/sched_debug file will be provided
From 6a2d3d9353938c3ece25a843b713f04e19655dfe Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 14 Aug 2021 11:30:52 -0500
Subject: [PATCH 33/34] muqss: Add MG-LRU changes through ifdef macro
As identified in #236, MG-LRU modifies CFS slightly as part of its
implementation. Make the equivalent changes in MuQSS with a C macro so
MuQSS can still be built independently even if the MG-LRU patchset is
not applied.
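A minimal sketch of the alternative idiom, for comparison: a no-op stub
when MG-LRU is not configured would let the call sites below stay
unconditional. The signature here is inferred from the call sites in
this patch; nothing in this series defines it:

    struct mm_struct; /* forward declaration, for the sketch only */

    #ifdef CONFIG_LRU_GEN
    void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new);
    #else
    static inline void lru_gen_switch_mm(struct mm_struct *old,
                                         struct mm_struct *new)
    {
    }
    #endif

The #ifdef-at-call-site form keeps the MuQSS diff self-contained, since
MuQSS cannot assume any MG-LRU header exists when the patchset is not
applied.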
---
kernel/sched/MuQSS.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index ac25e76db1a9c9..b88b52abc179a0 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -3013,6 +3013,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+#ifdef CONFIG_LRU_GEN
+ lru_gen_switch_mm(prev->active_mm, next->mm);
+#endif
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
@@ -6949,6 +6952,9 @@ void idle_task_exit(void)
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
+#ifdef CONFIG_LRU_GEN
+ lru_gen_switch_mm(mm, &init_mm);
+#endif
finish_arch_post_lock_switch();
}
From cac2f053d7f4f70894e8b7a30e5d8b63be503562 Mon Sep 17 00:00:00 2001
From: Eduardo <edzis@inbox.lv>
Date: Mon, 5 Jul 2021 11:37:43 +0300
Subject: [PATCH 34/34] muqss: Fix documentation regarding MC LLC runqueue
sharing
---
.../admin-guide/kernel-parameters.txt | 3 ++-
Documentation/scheduler/sched-MuQSS.txt | 19 ++++++++++---------
2 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b97bafd6c124a5..b7eb15e44e5ed6 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4866,8 +4866,9 @@
Format: <string>
smt -- Share SMT (hyperthread) sibling runqueues
mc -- Share MC (multicore) sibling runqueues
+ llc -- Share MC (multicore) sibling runqueues with adjacent LLC
smp -- Share SMP runqueues
- none -- So not share any runqueues
+ none -- Do not share any runqueues
Default value is mc
rw [KNL] Mount root device read-write on boot
diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt
index ae28b85c999513..c4708a54520f3c 100644
--- a/Documentation/scheduler/sched-MuQSS.txt
+++ b/Documentation/scheduler/sched-MuQSS.txt
@@ -264,17 +264,18 @@ Runqueue sharing.
By default MuQSS chooses to share runqueue resources (specifically the skip
list and locking) between multicore siblings. It is configurable at build time
-to select between None, SMT, MC and SMP, corresponding to no sharing, sharing
-only between simultaneous mulithreading siblings, multicore siblings, or
-symmetric multiprocessing physical packages. Additionally it can be se at
-bootime with the use of the rqshare parameter. The reason for configurability
-is that some architectures have CPUs with many multicore siblings (>= 16)
-where it may be detrimental to throughput to share runqueues and another
-sharing option may be desirable. Additionally, more sharing than usual can
-improve latency on a system-wide level at the expense of throughput if desired.
+to select between None, SMT, MC, LLC and SMP, corresponding to no sharing,
+sharing only between simultaneous multithreading siblings, multicore siblings,
+multicore siblings with adjacent LLC, or symmetric multiprocessing physical
+packages. Additionally it can be set at boot time with the use of the rqshare
+parameter. The reason for configurability is that some architectures have
+CPUs with many multicore siblings (>= 16) where it may be detrimental to
+throughput to share runqueues and another sharing option may be desirable.
+Additionally, more sharing than usual can improve latency on a system-wide
+level at the expense of throughput if desired.
The options are:
-none, smt, mc, smp
+none, smt, mc, llc, smp
eg:
rqshare=mc
From 4b382ba49343bebb160d99508552730bba1ea028 Mon Sep 17 00:00:00 2001
From: Eduardo <edzis@inbox.lv>
Date: Mon, 5 Jul 2021 21:53:34 +0300
Subject: [PATCH 1/4] muqss: Tune CPU selection v2
I was testing MuQSS performance using MC, LLC and SMT runqueue sharing and
discovered that it is not up to par, at least on my machine. This commit
fixes the performance on my machine (Ryzen 1700); I have no way of checking
on an Intel machine as I don't have one to test on, but it should behave
similarly.
This patch modifies best CPU selection:
It adds a check of not only whether the CPU in question is in the
idle mask, but also whether the mask contains its SMT sibling.
The rationale is that SMT siblings share all resources of the core,
and if another task is already scheduled there, it's not really
optimal to schedule more tasks onto the same SMT resources.
The patch also refines best CPU selection further and adds an exit
threshold.
The rationale there is that there is no need to search further once
the best possible selection has been found. What counts as the best
possible selection depends on the localities processed and is mainly
targeted around SMT.
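In outline, the search this patch implements tracks the best badness
score seen so far and stops early once that score reaches the best score
still theoretically possible for the localities being scanned. A
simplified standalone sketch of that shape (names and scoring are
illustrative, not the kernel code):

    #include <limits.h>

    /* Return the index of the candidate with the lowest score,
     * stopping early once no candidate can beat best_possible. */
    int pick_best(int ncand, int (*score)(int), int best_possible)
    {
            int best = -1, best_score = INT_MAX;

            for (int i = 0; i < ncand; i++) {
                    int s = score(i);

                    if (s < best_score) {
                            best = i;
                            best_score = s;
                    }
                    if (best_score <= best_possible)
                            break; /* already optimal; stop searching */
            }
            return best;
    }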
---
kernel/sched/MuQSS.c | 60 ++++++++++++++++++++++++++++++++------------
1 file changed, 44 insertions(+), 16 deletions(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index b88b52abc179a0..cdd18a8784faf1 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -1110,24 +1110,37 @@ static void resched_curr(struct rq *rq)
* Other node, other CPU, idle cache, idle threads.
* Other node, other CPU, busy cache, idle threads.
* Other node, other CPU, busy threads.
- */
+*/
static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
{
int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD;
- int cpu_tmp;
+ int cpu_tmp, best_poss_ranking;
+ struct rq *tmp_rq;
- if (cpumask_test_cpu(best_cpu, tmpmask))
- goto out;
+ if (cpumask_test_cpu(best_cpu, tmpmask)) {
+#ifdef CONFIG_SCHED_SMT
+ tmp_rq = cpu_rq(best_cpu);
+ if (tmp_rq->siblings_idle(tmp_rq) || !sched_smp_initialized)
+ return best_cpu;
+#else
+ return best_cpu;
+#endif
+ }
+#ifdef CONFIG_SCHED_SMT
+ best_poss_ranking = CPUIDLE_DIFF_THREAD;
+#elif CONFIG_SCHED_MC
+ best_poss_ranking = CPUIDLE_DIFF_CORE_LLC;
+#else
+ best_poss_ranking = CPUIDLE_DIFF_CPU;
+#endif
for_each_cpu(cpu_tmp, tmpmask) {
int ranking, locality;
- struct rq *tmp_rq;
ranking = 0;
tmp_rq = cpu_rq(cpu_tmp);
-
locality = rq->cpu_locality[cpu_tmp];
#ifdef CONFIG_NUMA
if (locality > LOCALITY_SMP)
@@ -1141,23 +1154,38 @@ static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
ranking |= CPUIDLE_DIFF_CORE_LLC;
else if (locality == LOCALITY_MC)
ranking |= CPUIDLE_DIFF_CORE;
- if (!(tmp_rq->cache_idle(tmp_rq)))
- ranking |= CPUIDLE_CACHE_BUSY;
#endif
#ifdef CONFIG_SCHED_SMT
- if (locality == LOCALITY_SMT)
- ranking |= CPUIDLE_DIFF_THREAD;
+ else if (locality == LOCALITY_SMT)
+ ranking |= CPUIDLE_DIFF_THREAD;
+#endif
+#ifdef CONFIG_SCHED_MC
+ if (ranking < best_ranking) {
+ if (!(tmp_rq->cache_idle(tmp_rq)))
+ ranking |= CPUIDLE_CACHE_BUSY;
+#endif
+#ifdef CONFIG_SCHED_SMT
+ if (ranking < best_ranking) {
+ if (!(tmp_rq->siblings_idle(tmp_rq))) {
+ ranking |= CPUIDLE_THREAD_BUSY;
+ if (locality == LOCALITY_SMT)
+ best_poss_ranking = CPUIDLE_DIFF_CORE_LLC;
+ }
#endif
- if (ranking < best_ranking
+ if (ranking < best_ranking) {
+ best_cpu = cpu_tmp;
+ best_ranking = ranking;
+ }
#ifdef CONFIG_SCHED_SMT
- || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq)))
+ }
#endif
- ) {
- best_cpu = cpu_tmp;
- best_ranking = ranking;
+#ifdef CONFIG_SCHED_MC
}
+#endif
+ if (best_ranking <= best_poss_ranking)
+ break;
}
-out:
+
return best_cpu;
}
From 64b68be9fb36e0f438e272bfd91846aaed5fc6cc Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Tue, 24 Aug 2021 09:48:16 -0500
Subject: [PATCH 2/4] Revert "muqss: Tune CPU selection v2"
Causes my Intel laptop to freeze randomly, and resuming from suspend
causes the load to overflow.
This reverts commit 4b382ba49343bebb160d99508552730bba1ea028.
---
kernel/sched/MuQSS.c | 60 ++++++++++++--------------------------------
1 file changed, 16 insertions(+), 44 deletions(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index cdd18a8784faf1..b88b52abc179a0 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -1110,37 +1110,24 @@ static void resched_curr(struct rq *rq)
* Other node, other CPU, idle cache, idle threads.
* Other node, other CPU, busy cache, idle threads.
* Other node, other CPU, busy threads.
-*/
+ */
static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
{
int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD;
- int cpu_tmp, best_poss_ranking;
- struct rq *tmp_rq;
+ int cpu_tmp;
- if (cpumask_test_cpu(best_cpu, tmpmask)) {
-#ifdef CONFIG_SCHED_SMT
- tmp_rq = cpu_rq(best_cpu);
- if (tmp_rq->siblings_idle(tmp_rq) || !sched_smp_initialized)
- return best_cpu;
-#else
- return best_cpu;
-#endif
- }
+ if (cpumask_test_cpu(best_cpu, tmpmask))
+ goto out;
-#ifdef CONFIG_SCHED_SMT
- best_poss_ranking = CPUIDLE_DIFF_THREAD;
-#elif CONFIG_SCHED_MC
- best_poss_ranking = CPUIDLE_DIFF_CORE_LLC;
-#else
- best_poss_ranking = CPUIDLE_DIFF_CPU;
-#endif
for_each_cpu(cpu_tmp, tmpmask) {
int ranking, locality;
+ struct rq *tmp_rq;
ranking = 0;
tmp_rq = cpu_rq(cpu_tmp);
+
locality = rq->cpu_locality[cpu_tmp];
#ifdef CONFIG_NUMA
if (locality > LOCALITY_SMP)
@@ -1154,38 +1141,23 @@ static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
ranking |= CPUIDLE_DIFF_CORE_LLC;
else if (locality == LOCALITY_MC)
ranking |= CPUIDLE_DIFF_CORE;
+ if (!(tmp_rq->cache_idle(tmp_rq)))
+ ranking |= CPUIDLE_CACHE_BUSY;
#endif
#ifdef CONFIG_SCHED_SMT
- else if (locality == LOCALITY_SMT)
- ranking |= CPUIDLE_DIFF_THREAD;
-#endif
-#ifdef CONFIG_SCHED_MC
- if (ranking < best_ranking) {
- if (!(tmp_rq->cache_idle(tmp_rq)))
- ranking |= CPUIDLE_CACHE_BUSY;
-#endif
-#ifdef CONFIG_SCHED_SMT
- if (ranking < best_ranking) {
- if (!(tmp_rq->siblings_idle(tmp_rq))) {
- ranking |= CPUIDLE_THREAD_BUSY;
- if (locality == LOCALITY_SMT)
- best_poss_ranking = CPUIDLE_DIFF_CORE_LLC;
- }
+ if (locality == LOCALITY_SMT)
+ ranking |= CPUIDLE_DIFF_THREAD;
#endif
- if (ranking < best_ranking) {
- best_cpu = cpu_tmp;
- best_ranking = ranking;
- }
+ if (ranking < best_ranking
#ifdef CONFIG_SCHED_SMT
- }
+ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq)))
#endif
-#ifdef CONFIG_SCHED_MC
+ ) {
+ best_cpu = cpu_tmp;
+ best_ranking = ranking;
}
-#endif
- if (best_ranking <= best_poss_ranking)
- break;
}
-
+out:
return best_cpu;
}
From 1fb7f0e32caf628f0e70480922ab606f2d859605 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Wed, 1 Sep 2021 19:00:11 -0500
Subject: [PATCH 3/4] muqss: Replace "while(42)" with "for (;;)" to match
mainline
Using "while(42)" was probably a funny replacement for "for (;;)" back
when the decision to use it was made, but all it does is make syncing
code from mainline frustrating.
Sync up with mainline to make porting easier.
---
kernel/sched/MuQSS.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
index 63935d778c50cb..a8f9b6087d94d2 100644
--- a/kernel/sched/MuQSS.h
+++ b/kernel/sched/MuQSS.h
@@ -455,7 +455,7 @@ static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf
{
struct rq *rq;
- while (42) {
+ for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
raw_spin_lock(rq->lock);
@@ -482,7 +482,7 @@ static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags _
lockdep_assert_held(&p->pi_lock);
- while (42) {
+ for (;;) {
rq = task_rq(p);
raw_spin_lock(rq->lock);
if (likely(rq == task_rq(p)))
From 00a9b574b607dc45c0435687d1f02dd0ab0deb6f Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Wed, 1 Sep 2021 19:22:43 -0500
Subject: [PATCH 4/4] muqss: Synchronize "(__)task_rq_lock" functions from
mainline
It looks like at some point mainline moved the locking functions
directly into the core.c file for CFS. MuQSS kept them in the header
with minor differences in implementation.
Sync them up so porting code is easier.
---
kernel/sched/MuQSS.c | 39 +++++++++++++++++++++++++++++++++++++++
kernel/sched/MuQSS.h | 38 +++++---------------------------------
2 files changed, 44 insertions(+), 33 deletions(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index b88b52abc179a0..eff16baf8d0f21 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -277,6 +277,45 @@ struct rq *uprq;
* Looking up task_rq must be done under rq->lock to be safe.
*/
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(rq->lock);
+ if (likely(rq == task_rq(p)))
+ return rq;
+ raw_spin_unlock(rq->lock);
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
+ rq = task_rq(p);
+ raw_spin_lock(rq->lock);
+ if (likely(rq == task_rq(p)))
+ return rq;
+ raw_spin_unlock(rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+ }
+}
+
/*
* RQ-clock updating methods:
*/
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
index a8f9b6087d94d2..3db551b436b725 100644
--- a/kernel/sched/MuQSS.h
+++ b/kernel/sched/MuQSS.h
@@ -449,23 +449,12 @@ static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
raw_spin_unlock_irqrestore(rq->lock, rf->flags);
}
-static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
-{
- struct rq *rq;
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock);
- for (;;) {
- raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
- rq = task_rq(p);
- raw_spin_lock(rq->lock);
- if (likely(rq == task_rq(p)))
- break;
- raw_spin_unlock(rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
- }
- return rq;
-}
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock);
static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
@@ -475,23 +464,6 @@ static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct r
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
-static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- lockdep_assert_held(&p->pi_lock);
-
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(rq->lock);
- if (likely(rq == task_rq(p)))
- break;
- raw_spin_unlock(rq->lock);
- }
- return rq;
-}
-
static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf)
{
rq_unlock(rq);