Compare commits

...

6 Commits

8 changed files with 10401 additions and 439 deletions

View File

@@ -3,7 +3,7 @@
# Linux distribution you are using, options are "Arch", "Ubuntu", "Debian", "Fedora", "Suse", "Gentoo", "Generic".
# It is automatically set to "Arch" when using PKGBUILD.
# If left empty, the script will prompt
_distro=""
_distro="Arch"
# Kernel Version - Options are "5.4", and from "5.7" to "5.19"
# you can also set a specific kernel version, e.g. "6.0-rc4" or "5.10.51",
@@ -46,7 +46,7 @@ CUSTOM_GCC_PATH=""
CUSTOM_LLVM_PATH=""
# Set to true to bypass makepkg.conf and use all available threads for compilation. False will respect your makepkg.conf options.
_force_all_threads="true"
_force_all_threads="false"
# Set to true to prevent ccache from being used and set CONFIG_GCC_PLUGINS=y (which needs to be disabled for ccache to work properly)
_noccache="false"
@@ -60,10 +60,10 @@ _modprobeddb="false"
_modprobeddb_db_path=~/.config/modprobed.db
# Set to "1" to call make menuconfig, "2" to call make nconfig, "3" to call make xconfig, before building the kernel. Set to false to disable and skip the prompt.
_menunconfig=""
_menunconfig="false"
# Set to true to generate a kernel config fragment from your changes in menuconfig/nconfig. Set to false to disable and skip the prompt.
_diffconfig=""
_diffconfig="false"
# Set to the file name where the generated config fragment should be written to. Only used if _diffconfig is active.
_diffconfig_name=""
@@ -97,11 +97,11 @@ _STRIP="true"
# LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME
# CPU scheduler - Options are "upds" (TkG's Undead PDS), "pds", "bmq", "muqss", "cacule", "tt", "bore" or "cfs" (kernel's default)
_cpusched=""
_cpusched="pds"
# Compiler to use - Options are "gcc" or "llvm".
# For advanced users.
_compiler=""
_compiler="gcc"
# Force the use of the LLVM Integrated Assembler whether using LLVM, LTO or not.
# Set to "1" to enable.
@@ -131,7 +131,7 @@ _preempt_rt_force=""
# For BMQ: 0: No yield.
# 1: Deboost and requeue task. (Default)
# 2: Set rq skip task.
_sched_yield_type=""
_sched_yield_type="0"
# Round Robin interval is the longest duration two tasks with the same nice level will be delayed for. When CPU time is requested by a task, it receives a time slice equal
# to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low value can help offset the disadvantages of rescheduling a process that has yielded.
@@ -139,7 +139,7 @@ _sched_yield_type=""
# PDS default: 4ms"
# BMQ default: 2ms"
# Set to "1" for 2ms, "2" for 4ms, "3" for 6ms, "4" for 8ms, or "default" to keep the chosen scheduler defaults.
_rr_interval=""
_rr_interval="2"
# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false"
_ftracedisable="false"
@@ -154,10 +154,10 @@ _misc_adds="true"
# Full tickless can give higher performance if you isolate CPUs for specific tasks,
# but it only takes effect with the nohz_full kernel parameter; otherwise it behaves like tickless idle.
# Plain tickless idle performs better on most platforms.
_tickless=""
_tickless="2"
# Set to "true" to use ACS override patch - https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29 - Kernel default is "false"
_acs_override=""
_acs_override="false"
# Set to "true" to add Bcache filesystem support. You'll have to install bcachefs-tools-git from AUR for utilities - https://bcachefs.org/ - If in doubt, set to "false"
# This can be buggy and isn't recommended on a production machine, also enabling this option will not allow you to enable MGLRU.
@@ -186,13 +186,13 @@ _futex_waitv="false"
_winesync="false"
# Set to "true" to enable Binder and Ashmem, the kernel modules required to use the android emulator Anbox. ! This doesn't apply to 5.4.y !
_anbox=""
_anbox="false"
# A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true"
_zenify="true"
# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. Optimize for size (-Os) - Kernel default is "1"
_compileroptlevel="1"
_compileroptlevel="2"
# CPU compiler optimizations - Defaults to prompt at kernel config if left empty
# AMD CPUs : "k8" "k8sse3" "k10" "barcelona" "bobcat" "jaguar" "bulldozer" "piledriver" "steamroller" "excavator" "zen" "zen2" "zen3" "zen4" (zen3 opt support depends on GCC11) (zen4 opt support depends on GCC13)
@@ -206,7 +206,7 @@ _compileroptlevel="1"
# - "generic_v2" (depends on GCC11 - to share the package between machines with different CPU µarch supporting at least x86-64-v2
# - "generic_v3" (depends on GCC11 - to share the package between machines with different CPU µarch supporting at least x86-64-v3
# - "generic_v4" (depends on GCC11 - to share the package between machines with different CPU µarch supporting at least x86-64-v4
_processor_opt=""
_processor_opt="skylake"
# MuQSS only - Make IRQ threading compulsory (FORCE_IRQ_THREADING) - Default is "false"
_irq_threading="false"
@@ -222,7 +222,7 @@ _cacule_rdb_interval="19"
_tt_high_hz="false"
# MuQSS and PDS only - SMT (Hyperthreading) aware nice priority and policy support (SMT_NICE) - Kernel default is "true" - You can disable this on non-SMT/HT CPUs for lower overhead
_smt_nice=""
_smt_nice="true"
# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false"
_random_trust_cpu="true"
@@ -232,7 +232,7 @@ _random_trust_cpu="true"
_runqueue_sharing=""
# Timer frequency - "100" "250" "300" "500" "750" "1000" ("2000" is available for cacule cpusched only) - More options available in kernel config prompt when left empty depending on selected cpusched with the default option pointed with a ">" (2000 for cacule, 100 for muqss and 1000 for other cpu schedulers)
_timer_freq=""
_timer_freq="500"
# Default CPU governor - "performance", "ondemand", "schedutil" or leave empty for default (schedutil)
_default_cpu_gov="ondemand"

View File

@@ -1,6 +1,6 @@
#
# Automatically generated file; DO NOT EDIT.
-# Linux/x86 6.2.0-rc1 Kernel Configuration
+# Linux/x86 6.2.0-rc4 Kernel Configuration
#
CONFIG_CC_VERSION_TEXT="gcc (GCC) 12.2.0"
CONFIG_CC_IS_GCC=y
@@ -206,6 +206,7 @@ CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y
CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y
CONFIG_CC_HAS_INT128=y
CONFIG_CC_IMPLICIT_FALLTHROUGH="-Wimplicit-fallthrough=5"
+CONFIG_GCC11_NO_ARRAY_BOUNDS=y
CONFIG_GCC12_NO_ARRAY_BOUNDS=y
CONFIG_CC_NO_ARRAY_BOUNDS=y
CONFIG_ARCH_SUPPORTS_INT128=y
@@ -241,6 +242,7 @@ CONFIG_UTS_NS=y
CONFIG_TIME_NS=y
CONFIG_IPC_NS=y
CONFIG_USER_NS=y
+CONFIG_USER_NS_UNPRIVILEGED=y
CONFIG_PID_NS=y
CONFIG_NET_NS=y
CONFIG_CHECKPOINT_RESTORE=y
@@ -2546,6 +2548,7 @@ CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
CONFIG_BLK_DEV_RAM_COUNT=16
CONFIG_BLK_DEV_RAM_SIZE=16384
+# CONFIG_CDROM_PKTCDVD is not set
CONFIG_ATA_OVER_ETH=m
CONFIG_XEN_BLKDEV_FRONTEND=m
CONFIG_XEN_BLKDEV_BACKEND=m

View File

@@ -284,7 +284,8 @@ _set_cpu_scheduler() {
elif [ "$_kver" = "601" ]; then
_avail_cpu_scheds=("cfs" "pds" "bmq" "tt" "bore")
elif [ "$_kver" = "602" ]; then
_avail_cpu_scheds=("cfs" "tt" "bore")
_avail_cpu_scheds=("cfs" "pds" "bmq" "tt" "bore")
_projectc_unoff=1
else
_avail_cpu_scheds=("cfs")
fi
@@ -722,7 +723,7 @@ _tkg_srcprep() {
elif [ "$_kver" = "518" ]; then
rev=2
elif [ "$_kver" = "601" ]; then
-rev=1
+rev=3
else
rev=0
fi
@@ -1572,9 +1573,13 @@ CONFIG_DEBUG_INFO_BTF_MODULES=y\r
fi
# NR_CPUS
if [ -n "$_NR_CPUS_value" ]; then
scripts/config --set-val "NR_CPUS" "$_NR_CPUS_value"
_enable "FORCE_NR_CPUS"
if [ "$_basever" != "601" ]; then
if [ -n "$_NR_CPUS_value" ]; then
scripts/config --set-val "NR_CPUS" "$_NR_CPUS_value"
_enable "FORCE_NR_CPUS"
fi
else
warning "NR_CPUS is bugged on 6.1.y, so your setting was ignored"
fi
fi

View File

@@ -5,7 +5,7 @@ index 42af9ca0127e..31747ec54f9d 100644
@@ -5406,6 +5406,12 @@
sa1100ir [NET]
See drivers/net/irda/sa1100_ir.c.
+ sched_timeslice=
+ [KNL] Time slice in ms for Project C BMQ/PDS scheduler.
+ Format: integer 2, 4
@@ -13,14 +13,14 @@ index 42af9ca0127e..31747ec54f9d 100644
+ See Documentation/scheduler/sched-BMQ.txt
+
sched_verbose [KNL] Enables verbose scheduler debug messages.
schedstats= [KNL,X86] Enable or disable scheduled statistics.
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 98d1b198b2b4..d7c78a107f93 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1552,3 +1552,13 @@ is 10 seconds.
The softlockup threshold is (``2 * watchdog_thresh``). Setting this
tunable to zero will disable lockup detection altogether.
+
@@ -161,7 +161,7 @@ index 9e479d7d202b..2a8530021b23 100644
+ (unsigned long long)tsk_seruntime(task),
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h
index 8874f681b056..59eb72bf7d5f 100644
--- a/include/asm-generic/resource.h
@@ -181,7 +181,7 @@ index ffb6eb55cd13..2e730a59caa2 100644
+++ b/include/linux/sched.h
@@ -762,8 +762,14 @@ struct task_struct {
unsigned int ptrace;
#ifdef CONFIG_SMP
- int on_cpu;
struct __call_single_node wake_entry;
@@ -202,11 +202,11 @@ index ffb6eb55cd13..2e730a59caa2 100644
+#endif /* !CONFIG_SCHED_ALT */
#endif
int on_rq;
@@ -785,6 +792,20 @@ struct task_struct {
int normal_prio;
unsigned int rt_priority;
+#ifdef CONFIG_SCHED_ALT
+ u64 last_ran;
+ s64 time_slice;
@@ -229,13 +229,13 @@ index ffb6eb55cd13..2e730a59caa2 100644
unsigned int core_occupation;
#endif
+#endif /* !CONFIG_SCHED_ALT */
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
@@ -1545,6 +1567,15 @@ struct task_struct {
*/
};
+#ifdef CONFIG_SCHED_ALT
+#define tsk_seruntime(t) ((t)->sched_time)
+/* replace the uncertain rt_timeout with 0UL */
@@ -254,7 +254,7 @@ index 7c83d4d5a971..fa30f98cb2be 100644
+++ b/include/linux/sched/deadline.h
@@ -1,5 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_SCHED_ALT
+
+static inline int dl_task(struct task_struct *p)
@@ -282,7 +282,7 @@ index 7c83d4d5a971..fa30f98cb2be 100644
return dl_prio(p->prio);
}
+#endif /* CONFIG_SCHED_ALT */
static inline bool dl_time_before(u64 a, u64 b)
{
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
@@ -292,7 +292,7 @@ index ab83d85e1183..6af9ae681116 100644
@@ -18,6 +18,32 @@
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
+#ifdef CONFIG_SCHED_ALT
+
+/* Undefine MAX_PRIO and DEFAULT_PRIO */
@@ -327,7 +327,7 @@ index 994c25640e15..8c050a59ece1 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
if (policy == SCHED_FIFO || policy == SCHED_RR)
return true;
+#ifndef CONFIG_SCHED_ALT
@@ -336,15 +336,15 @@ index 994c25640e15..8c050a59ece1 100644
+#endif
return false;
}
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 816df6cc444e..c8da08e18c91 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -234,7 +234,8 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
#endif /* !CONFIG_SMP */
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) && \
+ !defined(CONFIG_SCHED_ALT)
@@ -364,9 +364,9 @@ index 94125d3b6893..c87ba766d354 100644
This feature enables the scheduler to track the clamped utilization
of each CPU based on RUNNABLE tasks scheduled on that CPU.
@@ -865,6 +866,35 @@ config UCLAMP_BUCKETS_COUNT
If in doubt, use the default value.
+menuconfig SCHED_ALT
+ bool "Alternative CPU Schedulers"
+ default y
@@ -397,7 +397,7 @@ index 94125d3b6893..c87ba766d354 100644
+endif
+
endmenu
#
@@ -918,6 +948,7 @@ config NUMA_BALANCING
depends on ARCH_SUPPORTS_NUMA_BALANCING
@@ -410,7 +410,7 @@ index 94125d3b6893..c87ba766d354 100644
@@ -1015,6 +1046,7 @@ config FAIR_GROUP_SCHED
depends on CGROUP_SCHED
default CGROUP_SCHED
+if !SCHED_ALT
config CFS_BANDWIDTH
bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
@@ -418,13 +418,13 @@ index 94125d3b6893..c87ba766d354 100644
@@ -1037,6 +1069,7 @@ config RT_GROUP_SCHED
realtime bandwidth for them.
See Documentation/scheduler/sched-rt-group.rst for more information.
+endif #!SCHED_ALT
endif #CGROUP_SCHED
config UCLAMP_TASK_GROUP
@@ -1281,6 +1314,7 @@ config CHECKPOINT_RESTORE
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
+ depends on !SCHED_ALT
@@ -482,7 +482,7 @@ index c2f1fd95a821..41654679b1b2 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -117,7 +117,7 @@ config PREEMPT_DYNAMIC
config SCHED_CORE
bool "Core Scheduling for SMT"
- depends on SCHED_SMT
@@ -497,7 +497,7 @@ index b474289c15b8..a23224b45b03 100644
@@ -787,7 +787,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
return ret;
}
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT)
/*
@@ -522,9 +522,9 @@ index e39cb696cfbd..463423572e09 100644
t2 = tsk->sched_info.run_delay;
- t3 = tsk->se.sum_exec_runtime;
+ t3 = tsk_seruntime(tsk);
d->cpu_count += t1;
diff --git a/kernel/exit.c b/kernel/exit.c
index 35e0a31a0315..64e368441cf4 100644
--- a/kernel/exit.c
@@ -532,11 +532,11 @@ index 35e0a31a0315..64e368441cf4 100644
@@ -125,7 +125,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->curr_target = next_thread(tsk);
}
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+ add_device_randomness((const void*) &tsk_seruntime(tsk),
sizeof(unsigned long long));
/*
@@ -146,7 +146,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->inblock += task_io_get_inblock(tsk);
@@ -558,14 +558,14 @@ index 7779ee8abc2a..5b9893cdfb1b 100644
- waiter->deadline = task->dl.deadline;
+ waiter->deadline = __tsk_deadline(task);
}
/*
* Only use with rt_mutex_waiter_{less,equal}()
*/
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = __tsk_deadline(p) }
static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
{
@@ -574,7 +574,7 @@ index 7779ee8abc2a..5b9893cdfb1b 100644
+#else
if (left->prio < right->prio)
return 1;
+#ifndef CONFIG_SCHED_BMQ
/*
* If both waiters have dl_prio(), we check the deadlines of the
@@ -584,11 +584,11 @@ index 7779ee8abc2a..5b9893cdfb1b 100644
if (dl_prio(left->prio))
return dl_time_before(left->deadline, right->deadline);
+#endif
return 0;
+#endif
}
static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
{
@@ -597,7 +597,7 @@ index 7779ee8abc2a..5b9893cdfb1b 100644
+#else
if (left->prio != right->prio)
return 0;
+#ifndef CONFIG_SCHED_BMQ
/*
* If both waiters have dl_prio(), we check the deadlines of the
@@ -607,11 +607,11 @@ index 7779ee8abc2a..5b9893cdfb1b 100644
if (dl_prio(left->prio))
return left->deadline == right->deadline;
+#endif
return 1;
+#endif
}
static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 976092b7bd45..31d587c16ec1 100644
@@ -632,10 +632,10 @@ index 976092b7bd45..31d587c16ec1 100644
obj-y += build_utility.o
diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
new file mode 100644
index 000000000000..572eab74418f
index 000000000000..acb8657e811d
--- /dev/null
+++ b/kernel/sched/alt_core.c
@@ -0,0 +1,7961 @@
@@ -0,0 +1,7978 @@
+/*
+ * kernel/sched/alt_core.c
+ *
@@ -665,7 +665,6 @@ index 000000000000..572eab74418f
+#include <linux/init_task.h>
+#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/profile.h>
+#include <linux/nmi.h>
+#include <linux/scs.h>
+
@@ -706,7 +705,7 @@ index 000000000000..572eab74418f
+#define sched_feat(x) (0)
+#endif /* CONFIG_SCHED_DEBUG */
+
+#define ALT_SCHED_VERSION "v6.1-r1"
+#define ALT_SCHED_VERSION "v6.1-r3"
+
+/* rt_prio(prio) defined in include/linux/sched/rt.h */
+#define rt_task(p) rt_prio((p)->prio)
@@ -815,14 +814,14 @@ index 000000000000..572eab74418f
+clear_recorded_preempt_mask(int pr, int low, int high, int cpu)
+{
+ if (low < pr && pr <= high)
+ cpumask_clear_cpu(cpu, sched_preempt_mask + SCHED_QUEUE_BITS - 1 - pr);
+ cpumask_clear_cpu(cpu, sched_preempt_mask + SCHED_QUEUE_BITS - pr);
+}
+
+static inline void
+set_recorded_preempt_mask(int pr, int low, int high, int cpu)
+{
+ if (low < pr && pr <= high)
+ cpumask_set_cpu(cpu, sched_preempt_mask + SCHED_QUEUE_BITS - 1 - pr);
+ cpumask_set_cpu(cpu, sched_preempt_mask + SCHED_QUEUE_BITS - pr);
+}
+
+static atomic_t sched_prio_record = ATOMIC_INIT(0);
@@ -1392,8 +1391,8 @@ index 000000000000..572eab74418f
+ * Context: rq->lock
+ */
+#define __SCHED_DEQUEUE_TASK(p, rq, flags) \
+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \
+ sched_info_dequeue(rq, p); \
+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \
+ \
+ list_del(&p->sq_node); \
+ if (list_empty(&rq->queue.heads[p->sq_idx])) \
@@ -2030,11 +2029,13 @@ index 000000000000..572eab74418f
+
+ WARN_ON_ONCE(is_migration_disabled(p));
+#endif
+ if (task_cpu(p) == new_cpu)
+ return;
+ trace_sched_migrate_task(p, new_cpu);
+ rseq_migrate(p);
+ perf_event_task_migrate(p);
+
+ if (task_cpu(p) != new_cpu)
+ {
+ rseq_migrate(p);
+ perf_event_task_migrate(p);
+ }
+
+ __set_task_cpu(p, new_cpu);
+}
@@ -5110,15 +5111,15 @@ index 000000000000..572eab74418f
+ if (src_rq->nr_running < 2)
+ cpumask_clear_cpu(i, &sched_rq_pending_mask);
+
+ spin_release(&src_rq->lock.dep_map, _RET_IP_);
+ do_raw_spin_unlock(&src_rq->lock);
+
+ rq->nr_running += nr_migrated;
+ if (rq->nr_running > 1)
+ cpumask_set_cpu(cpu, &sched_rq_pending_mask);
+
+ cpufreq_update_util(rq, 0);
+
+ spin_release(&src_rq->lock.dep_map, _RET_IP_);
+ do_raw_spin_unlock(&src_rq->lock);
+
+ return 1;
+ }
+
@@ -5147,7 +5148,7 @@ index 000000000000..572eab74418f
+}
+
+static inline struct task_struct *
+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev)
+choose_next_task(struct rq *rq, int cpu)
+{
+ struct task_struct *next;
+
@@ -5334,7 +5335,7 @@ index 000000000000..572eab74418f
+
+ check_curr(prev, rq);
+
+ next = choose_next_task(rq, cpu, prev);
+ next = choose_next_task(rq, cpu);
+ clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
@@ -5764,6 +5765,7 @@ index 000000000000..572eab74418f
+ return;
+
+ rq = __task_access_lock(p, &lock);
+ update_rq_clock(rq);
+ /*
+ * Set under pi_lock && rq->lock, such that the value can be used under
+ * either lock.
@@ -6643,6 +6645,13 @@ index 000000000000..572eab74418f
+ return retval;
+}
+
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+{
+ return 0;
+}
+#endif
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+{
@@ -8027,6 +8036,14 @@ index 000000000000..572eab74418f
+
+ sched_smp_initialized = true;
+}
+
+static int __init migration_init(void)
+{
+ sched_cpu_starting(smp_processor_id());
+ return 0;
+}
+early_initcall(migration_init);
+
+#else
+void __init sched_init_smp(void)
+{
@@ -8636,14 +8653,15 @@ index 000000000000..1212a031700e
+{}
diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h
new file mode 100644
index 000000000000..e3b6320a397a
index 000000000000..c32403ed82b6
--- /dev/null
+++ b/kernel/sched/alt_sched.h
@@ -0,0 +1,667 @@
@@ -0,0 +1,668 @@
+#ifndef ALT_SCHED_H
+#define ALT_SCHED_H
+
+#include <linux/context_tracking.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/stop_machine.h>
+#include <linux/syscalls.h>
@@ -9428,23 +9446,23 @@ index d9dc9ab3773f..71a25540d65e 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -42,13 +42,19 @@
#include "idle.c"
+#ifndef CONFIG_SCHED_ALT
#include "rt.c"
+#endif
#ifdef CONFIG_SMP
+#ifndef CONFIG_SCHED_ALT
# include "cpudeadline.c"
+#endif
# include "pelt.c"
#endif
#include "cputime.c"
-#include "deadline.c"
+#ifndef CONFIG_SCHED_ALT
+#include "deadline.c"
+#endif
@@ -9453,7 +9471,7 @@ index 99bdd96f454f..23f80a86d2d7 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -85,7 +85,9 @@
#ifdef CONFIG_SMP
# include "cpupri.c"
+#ifndef CONFIG_SCHED_ALT
@@ -9461,14 +9479,14 @@ index 99bdd96f454f..23f80a86d2d7 100644
+#endif
# include "topology.c"
#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1207c78f85c1..68812e0756cb 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -159,9 +159,14 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
struct rq *rq = cpu_rq(sg_cpu->cpu);
sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
+#ifndef CONFIG_SCHED_ALT
sg_cpu->bw_dl = cpu_bw_dl(rq);
@@ -9479,7 +9497,7 @@ index 1207c78f85c1..68812e0756cb 100644
+ sg_cpu->util = rq_load_util(rq, sg_cpu->max);
+#endif /* CONFIG_SCHED_ALT */
}
/**
@@ -305,8 +310,10 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
*/
@@ -9490,11 +9508,11 @@ index 1207c78f85c1..68812e0756cb 100644
sg_cpu->sg_policy->limits_changed = true;
+#endif
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
@@ -606,6 +613,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
ret = sched_setattr_nocheck(thread, &attr);
+
if (ret) {
@@ -9509,7 +9527,7 @@ index 1207c78f85c1..68812e0756cb 100644
+#endif /* CONFIG_SCHED_ALT */
}
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 95fc77853743..b48b3f9ed47f 100644
--- a/kernel/sched/cputime.c
@@ -9517,15 +9535,15 @@ index 95fc77853743..b48b3f9ed47f 100644
@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime)
p->utime += cputime;
account_group_user_time(p, cputime);
- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER;
/* Add user time to cpustat. */
task_group_account_field(p, index, cputime);
@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime)
p->gtime += cputime;
/* Add guest time to cpustat. */
- if (task_nice(p) > 0) {
+ if (task_running_nice(p)) {
@@ -9543,12 +9561,12 @@ index 95fc77853743..b48b3f9ed47f 100644
static u64 read_sum_exec_runtime(struct task_struct *t)
@@ -294,7 +294,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
struct rq *rq;
rq = task_rq_lock(t, &rf);
- ns = t->se.sum_exec_runtime;
+ ns = tsk_seruntime(t);
task_rq_unlock(rq, t, &rf);
return ns;
@@ -626,7 +626,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
@@ -9557,7 +9575,7 @@ index 95fc77853743..b48b3f9ed47f 100644
- .sum_exec_runtime = p->se.sum_exec_runtime,
+ .sum_exec_runtime = tsk_seruntime(p),
};
if (task_cputime(p, &cputime.utime, &cputime.stime))
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..033c6deeb515 100644
@@ -9566,39 +9584,39 @@ index 1637b65ba07a..033c6deeb515 100644
@@ -7,6 +7,7 @@
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*/
+#ifndef CONFIG_SCHED_ALT
/*
* This allows printing both to /proc/sched_debug and
* to the console
@@ -215,6 +216,7 @@ static const struct file_operations sched_scaling_fops = {
};
#endif /* SMP */
+#endif /* !CONFIG_SCHED_ALT */
#ifdef CONFIG_PREEMPT_DYNAMIC
@@ -278,6 +280,7 @@ static const struct file_operations sched_dynamic_fops = {
#endif /* CONFIG_PREEMPT_DYNAMIC */
+#ifndef CONFIG_SCHED_ALT
__read_mostly bool sched_debug_verbose;
static const struct seq_operations sched_debug_sops;
@@ -293,6 +296,7 @@ static const struct file_operations sched_debug_fops = {
.llseek = seq_lseek,
.release = seq_release,
};
+#endif /* !CONFIG_SCHED_ALT */
static struct dentry *debugfs_sched;
@@ -302,12 +306,15 @@ static __init int sched_init_debug(void)
debugfs_sched = debugfs_create_dir("sched", NULL);
+#ifndef CONFIG_SCHED_ALT
debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
@@ -9606,31 +9624,31 @@ index 1637b65ba07a..033c6deeb515 100644
#ifdef CONFIG_PREEMPT_DYNAMIC
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif
+#ifndef CONFIG_SCHED_ALT
debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
@@ -337,11 +344,13 @@ static __init int sched_init_debug(void)
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
+#endif /* !CONFIG_SCHED_ALT */
return 0;
}
late_initcall(sched_init_debug);
+#ifndef CONFIG_SCHED_ALT
#ifdef CONFIG_SMP
static cpumask_var_t sd_sysctl_cpus;
@@ -1068,6 +1077,7 @@ void proc_sched_set_task(struct task_struct *p)
memset(&p->stats, 0, sizeof(p->stats));
#endif
}
+#endif /* !CONFIG_SCHED_ALT */
void resched_latency_warn(int cpu, u64 latency)
{
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
@@ -9640,7 +9658,7 @@ index f26ab2675f7d..480d4ad16d45 100644
@@ -400,6 +400,7 @@ void cpu_startup_entry(enum cpuhp_state state)
do_idle();
}
+#ifndef CONFIG_SCHED_ALT
/*
* idle-task scheduling class.
@@ -9790,17 +9808,17 @@ index 0f310768260c..bd38bf738fe9 100644
@@ -266,6 +266,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}
+#ifndef CONFIG_SCHED_ALT
/*
* sched_entity:
*
@@ -383,8 +384,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
+#endif
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT)
/*
@@ -9813,7 +9831,7 @@ index 3a0e0dc28721..e8a7d84aa5a5 100644
@@ -1,13 +1,15 @@
#ifdef CONFIG_SMP
#include "sched-pelt.h"
+#ifndef CONFIG_SCHED_ALT
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
@@ -9821,16 +9839,16 @@ index 3a0e0dc28721..e8a7d84aa5a5 100644
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+#endif
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT)
int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
static inline u64 thermal_load_avg(struct rq *rq)
@@ -44,6 +46,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
return PELT_MIN_DIVIDER + avg->period_contrib;
}
+#ifndef CONFIG_SCHED_ALT
static inline void cfs_se_util_change(struct sched_avg *avg)
{
@@ -9840,9 +9858,9 @@ index 3a0e0dc28721..e8a7d84aa5a5 100644
}
#endif
+#endif /* CONFIG_SCHED_ALT */
#else
+#ifndef CONFIG_SCHED_ALT
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
@@ -9852,7 +9870,7 @@ index 3a0e0dc28721..e8a7d84aa5a5 100644
return 0;
}
+#endif
static inline int
update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
@@ -9862,7 +9880,7 @@ index a4a20046e586..c363693cd869 100644
@@ -5,6 +5,10 @@
#ifndef _KERNEL_SCHED_SCHED_H
#define _KERNEL_SCHED_SCHED_H
+#ifdef CONFIG_SCHED_ALT
+#include "alt_sched.h"
+#else
@@ -9873,7 +9891,7 @@ index a4a20046e586..c363693cd869 100644
@@ -3183,4 +3187,9 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
cgroup_account_cputime(curr, delta_exec);
}
+static inline int task_running_nice(struct task_struct *p)
+{
+ return (task_nice(p) > 0);
@@ -9897,7 +9915,7 @@ index 857f837f52cb..5486c63e4790 100644
rq = cpu_rq(cpu);
@@ -143,6 +145,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
#ifdef CONFIG_SMP
+#ifndef CONFIG_SCHED_ALT
/* domain-specific stats */
@@ -9916,9 +9934,9 @@ index 84a188913cc9..53934e7ef5db 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -89,6 +89,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
#endif /* CONFIG_SCHEDSTATS */
+#ifndef CONFIG_SCHED_ALT
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity_stats {
@@ -9928,7 +9946,7 @@ index 84a188913cc9..53934e7ef5db 100644
return &task_of(se)->stats;
}
+#endif /* CONFIG_SCHED_ALT */
#ifdef CONFIG_PSI
void psi_task_change(struct task_struct *task, int clear, int set);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
@@ -9938,37 +9956,37 @@ index 8739c2a5a54e..d8dd6c15eb47 100644
@@ -3,6 +3,7 @@
* Scheduler topology setup/handling methods
*/
+#ifndef CONFIG_SCHED_ALT
DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
@@ -1413,8 +1414,10 @@ static void asym_cpu_capacity_scan(void)
*/
static int default_relax_domain_level = -1;
+#endif /* CONFIG_SCHED_ALT */
int sched_domain_level_max;
+#ifndef CONFIG_SCHED_ALT
static int __init setup_relax_domain_level(char *str)
{
if (kstrtoint(str, 0, &default_relax_domain_level))
@@ -1647,6 +1650,7 @@ sd_init(struct sched_domain_topology_level *tl,
return sd;
}
+#endif /* CONFIG_SCHED_ALT */
/*
* Topology list, bottom-up.
@@ -1683,6 +1687,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl)
sched_domain_topology_saved = NULL;
}
+#ifndef CONFIG_SCHED_ALT
#ifdef CONFIG_NUMA
static const struct cpumask *sd_numa_mask(int cpu)
@@ -2645,3 +2650,15 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
@@ -9991,9 +10009,9 @@ index c6d9dec11b74..2bc42ce8b48e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -93,6 +93,10 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
/* Constants used for minimum and maximum */
+#ifdef CONFIG_SCHED_ALT
+extern int sched_yield_type;
+#endif
@@ -10003,7 +10021,7 @@ index c6d9dec11b74..2bc42ce8b48e 100644
#endif
@@ -1633,6 +1637,7 @@ int proc_do_static_key(struct ctl_table *table, int write,
}
static struct ctl_table kern_table[] = {
+#ifndef CONFIG_SCHED_ALT
#ifdef CONFIG_NUMA_BALANCING
@@ -10042,13 +10060,13 @@ index 3ae661ab6260..35f0176dcdb0 100644
@@ -2088,8 +2088,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
int ret = 0;
u64 slack;
+#ifndef CONFIG_SCHED_ALT
slack = current->timer_slack_ns;
if (dl_task(current) || rt_task(current))
+#endif
slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index cb925e8ef9a8..67d823510f5c 100644
@@ -10056,17 +10074,17 @@ index cb925e8ef9a8..67d823510f5c 100644
+++ b/kernel/time/posix-cpu-timers.c
@@ -223,7 +223,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples)
u64 stime, utime;
task_cputime(p, &utime, &stime);
- store_samples(samples, stime, utime, p->se.sum_exec_runtime);
+ store_samples(samples, stime, utime, tsk_seruntime(p));
}
static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
@@ -866,6 +866,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
}
}
+#ifndef CONFIG_SCHED_ALT
static inline void check_dl_overrun(struct task_struct *tsk)
{
@@ -10076,18 +10094,18 @@ index cb925e8ef9a8..67d823510f5c 100644
}
}
+#endif
static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
{
@@ -900,8 +902,10 @@ static void check_thread_timers(struct task_struct *tsk,
u64 samples[CPUCLOCK_MAX];
unsigned long soft;
+#ifndef CONFIG_SCHED_ALT
if (dl_task(tsk))
check_dl_overrun(tsk);
+#endif
if (expiry_cache_is_inactive(pct))
return;
@@ -915,7 +919,7 @@ static void check_thread_timers(struct task_struct *tsk,
@@ -10097,17 +10115,17 @@ index cb925e8ef9a8..67d823510f5c 100644
- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ);
unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
/* At the hard limit, send SIGKILL. No further action. */
@@ -1151,8 +1155,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
return true;
}
+#ifndef CONFIG_SCHED_ALT
if (dl_task(tsk) && tsk->dl.dl_overrun)
return true;
+#endif
return false;
}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
@@ -10129,4 +10147,4 @@ index a2d301f58ced..2ccdede8585c 100644
+#endif
};
struct wakeup_test_data *x = data;

View File

@@ -2540,41 +2540,3 @@ index 41fd8352ab6561..1d2f77835de5f0 100644
/**
From 07fa6df737871f5e491ec70f499963aedd679f2a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 11 Jan 2023 17:07:33 +0100
Subject: [PATCH] netfilter: nft_payload: incorrect arithmetics when fetching
VLAN header bits
commit 696e1a48b1a1b01edad542a1ef293665864a4dd0 upstream.
If the offset + length goes over the ethernet + vlan header, then the
length is adjusted to copy the bytes that are within the boundaries of
the vlan_ethhdr scratchpad area. The remaining bytes beyond ethernet +
vlan header are copied directly from the skbuff data area.
Fix incorrect arithmetic operator: subtract, not add, the size of the
vlan header in case of double-tagged packets to adjust the length
accordingly to address CVE-2023-0179.
Reported-by: Davide Ornaghi <d.ornaghi97@gmail.com>
Fixes: f6ae9f120dad ("netfilter: nft_payload: add C-VLAN support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
net/netfilter/nft_payload.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 4edd899aeb9bb5..d7de2ecb287eba 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -62,7 +62,7 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
return false;
if (offset + len > VLAN_ETH_HLEN + vlan_hlen)
- ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen;
+ ethlen -= offset + len - VLAN_ETH_HLEN - vlan_hlen;
memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen);
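
To see what the one-character fix above changes, here is a worked example (illustrative only, not part of the patch; it assumes ethlen is a u8 initialized to len, matching the u8 offset/len parameters of nft_payload_copy_vlan()). With VLAN_ETH_HLEN = 18 and a double-tagged packet (vlan_hlen = 4), the on-stack vlan_ethhdr scratchpad holds 22 valid bytes, so a request with offset = 20 and len = 4 reaches 2 bytes past it:

	/* illustrative arithmetic only */
	u8 ethlen = len;                    /* 4 */
	/* buggy: ethlen -= 20 + 4 - 18 + 4;   4 - 10 underflows to 250 as a u8,
	 *        so the memcpy copies far more than the 22-byte scratchpad holds */
	/* fixed: ethlen -= 20 + 4 - 18 - 4;   4 - 2 = 2, copying exactly the
	 *        in-bounds bytes; the remaining 2 come from the skb data area */

The u8 underflow is what made CVE-2023-0179 dangerous: the over-subtraction could never yield a short copy, only a huge one.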

View File

@@ -0,0 +1,90 @@
From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001
From: Tk-Glitch <ti3nou@gmail.com>
Date: Wed, 4 Jul 2018 04:30:08 +0200
Subject: glitched - PDS
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..1d9c7ed79b11 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -4,7 +4,7 @@
choice
prompt "Timer frequency"
- default HZ_250
+ default HZ_500
help
Allows the configuration of the timer frequency. It is customary
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
@@ -39,6 +39,13 @@ choice
on SMP and NUMA systems and exactly dividing by both PAL and
NTSC frame rates for video and multimedia work.
+ config HZ_500
+ bool "500 HZ"
+ help
+ 500 Hz is a balanced timer frequency. Provides fast interactivity
+ on desktops with great smoothness without increasing CPU power
+ consumption and sacrificing the battery life on laptops.
+
config HZ_1000
bool "1000 HZ"
help
@@ -52,6 +59,7 @@ config HZ
default 100 if HZ_100
default 250 if HZ_250
default 300 if HZ_300
+ default 500 if HZ_500
default 1000 if HZ_1000
config SCHED_HRTICK
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..1d9c7ed79b11 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -4,7 +4,7 @@
choice
prompt "Timer frequency"
- default HZ_500
+ default HZ_750
help
Allows the configuration of the timer frequency. It is customary
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
@@ -46,6 +46,13 @@ choice
on desktops with great smoothness without increasing CPU power
consumption and sacrificing the battery life on laptops.
+ config HZ_750
+ bool "750 HZ"
+ help
+ 750 Hz is a good timer frequency for desktops. Provides fast
+ interactivity with great smoothness without sacrificing too
+ much throughput.
+
config HZ_1000
bool "1000 HZ"
help
@@ -60,6 +67,7 @@ config HZ
default 250 if HZ_250
default 300 if HZ_300
default 500 if HZ_500
+ default 750 if HZ_750
default 1000 if HZ_1000
config SCHED_HRTICK
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9270a4370d54..30d01e647417 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -169,7 +169,7 @@
/*
* From 0 .. 200. Higher means more swappy.
*/
-int vm_swappiness = 60;
+int vm_swappiness = 20;
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)

File diff suppressed because it is too large.

View File

@@ -64,253 +64,3 @@ index 2c7171e0b0010..85de313ddec29 100644
select CPU_FREQ_GOV_PERFORMANCE
help
From 0c079d3f88df5f8286cd5c91b54bdac7c819be85 Mon Sep 17 00:00:00 2001
From: Matthew Auld <matthew.auld@intel.com>
Date: Tue, 6 Dec 2022 16:11:41 +0000
Subject: [PATCH] drm/i915: improve the catch-all evict to handle lock
contention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The catch-all evict can fail due to object lock contention, since it
only goes as far as trylocking the object, due to us already holding the
vm->mutex. Doing a full object lock here can deadlock, since the
vm->mutex is always our inner lock. Add another execbuf pass which drops
the vm->mutex and then tries to grab the object with the full lock,
before then retrying the eviction. This should be good enough for now to
fix the immediate regression with userspace seeing -ENOSPC from execbuf
due to contended object locks during GTT eviction.
Testcase: igt@gem_ppgtt@shrink-vs-evict-*
Fixes: 7e00897be8bf ("drm/i915: Add object locking to i915_gem_evict_for_node and i915_gem_evict_something, v2.")
References: https://gitlab.freedesktop.org/drm/intel/-/issues/7627
References: https://gitlab.freedesktop.org/drm/intel/-/issues/7570
References: https://bugzilla.mozilla.org/show_bug.cgi?id=1779558
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Andrzej Hajda <andrzej.hajda@intel.com>
Cc: Mani Milani <mani@chromium.org>
Cc: <stable@vger.kernel.org> # v5.18+
Revision 1 of https://patchwork.freedesktop.org/series/111686/
---
.../gpu/drm/i915/gem/i915_gem_execbuffer.c | 25 +++++++++++--
drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 +-
drivers/gpu/drm/i915/i915_gem_evict.c | 37 ++++++++++++++-----
drivers/gpu/drm/i915/i915_gem_evict.h | 4 +-
drivers/gpu/drm/i915/i915_vma.c | 2 +-
.../gpu/drm/i915/selftests/i915_gem_evict.c | 4 +-
6 files changed, 56 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 845023c14eb36f..094e92ed28db4f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -741,25 +741,44 @@ static int eb_reserve(struct i915_execbuffer *eb)
*
* Defragmenting is skipped if all objects are pinned at a fixed location.
*/
- for (pass = 0; pass <= 2; pass++) {
+ for (pass = 0; pass <= 3; pass++) {
int pin_flags = PIN_USER | PIN_VALIDATE;
if (pass == 0)
pin_flags |= PIN_NONBLOCK;
if (pass >= 1)
- unpinned = eb_unbind(eb, pass == 2);
+ unpinned = eb_unbind(eb, pass >= 2);
if (pass == 2) {
err = mutex_lock_interruptible(&eb->context->vm->mutex);
if (!err) {
- err = i915_gem_evict_vm(eb->context->vm, &eb->ww);
+ err = i915_gem_evict_vm(eb->context->vm, &eb->ww, NULL);
mutex_unlock(&eb->context->vm->mutex);
}
if (err)
return err;
}
+ if (pass == 3) {
+retry:
+ err = mutex_lock_interruptible(&eb->context->vm->mutex);
+ if (!err) {
+ struct drm_i915_gem_object *busy_bo = NULL;
+
+ err = i915_gem_evict_vm(eb->context->vm, &eb->ww, &busy_bo);
+ mutex_unlock(&eb->context->vm->mutex);
+ if (err && busy_bo) {
+ err = i915_gem_object_lock(busy_bo, &eb->ww);
+ i915_gem_object_put(busy_bo);
+ if (!err)
+ goto retry;
+ }
+ }
+ if (err)
+ return err;
+ }
+
list_for_each_entry(ev, &eb->unbound, bind_link) {
err = eb_reserve_vma(eb, ev, pin_flags);
if (err)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index 73d9eda1d6b7a6..c83d98e1dc5da0 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -369,7 +369,7 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
if (vma == ERR_PTR(-ENOSPC)) {
ret = mutex_lock_interruptible(&ggtt->vm.mutex);
if (!ret) {
- ret = i915_gem_evict_vm(&ggtt->vm, &ww);
+ ret = i915_gem_evict_vm(&ggtt->vm, &ww, NULL);
mutex_unlock(&ggtt->vm.mutex);
}
if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index f025ee4fa52618..a4b4d9b7d26c7a 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -416,6 +416,11 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
* @vm: Address space to cleanse
* @ww: An optional struct i915_gem_ww_ctx. If not NULL, i915_gem_evict_vm
* will be able to evict vma's locked by the ww as well.
+ * @busy_bo: Optional pointer to struct drm_i915_gem_object. If not NULL, then
+ * in the event i915_gem_evict_vm() is unable to trylock an object for eviction,
+ * then @busy_bo will point to it. -EBUSY is also returned. The caller must drop
+ * the vm->mutex, before trying again to acquire the contended lock. The caller
+ * also owns a reference to the object.
*
* This function evicts all vmas from a vm.
*
@@ -425,7 +430,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
* To clarify: This is for freeing up virtual address space, not for freeing
* memory in e.g. the shrinker.
*/
-int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww)
+int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww,
+ struct drm_i915_gem_object **busy_bo)
{
int ret = 0;
@@ -457,15 +463,22 @@ int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww)
* the resv is shared among multiple objects, we still
* need the object ref.
*/
- if (dying_vma(vma) ||
+ if (!i915_gem_object_get_rcu(vma->obj) ||
(ww && (dma_resv_locking_ctx(vma->obj->base.resv) == &ww->ctx))) {
__i915_vma_pin(vma);
list_add(&vma->evict_link, &locked_eviction_list);
continue;
}
- if (!i915_gem_object_trylock(vma->obj, ww))
+ if (!i915_gem_object_trylock(vma->obj, ww)) {
+ if (busy_bo) {
+ *busy_bo = vma->obj; /* holds ref */
+ ret = -EBUSY;
+ break;
+ }
+ i915_gem_object_put(vma->obj);
continue;
+ }
__i915_vma_pin(vma);
list_add(&vma->evict_link, &eviction_list);
@@ -473,25 +486,29 @@ int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww)
if (list_empty(&eviction_list) && list_empty(&locked_eviction_list))
break;
- ret = 0;
/* Unbind locked objects first, before unlocking the eviction_list */
list_for_each_entry_safe(vma, vn, &locked_eviction_list, evict_link) {
__i915_vma_unpin(vma);
- if (ret == 0)
+ if (ret == 0) {
ret = __i915_vma_unbind(vma);
- if (ret != -EINTR) /* "Get me out of here!" */
- ret = 0;
+ if (ret != -EINTR) /* "Get me out of here!" */
+ ret = 0;
+ }
+ if (!dying_vma(vma))
+ i915_gem_object_put(vma->obj);
}
list_for_each_entry_safe(vma, vn, &eviction_list, evict_link) {
__i915_vma_unpin(vma);
- if (ret == 0)
+ if (ret == 0) {
ret = __i915_vma_unbind(vma);
- if (ret != -EINTR) /* "Get me out of here!" */
- ret = 0;
+ if (ret != -EINTR) /* "Get me out of here!" */
+ ret = 0;
+ }
i915_gem_object_unlock(vma->obj);
+ i915_gem_object_put(vma->obj);
}
} while (ret == 0);
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.h b/drivers/gpu/drm/i915/i915_gem_evict.h
index e593c530f9bd7a..bf0ee0e4fe6088 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.h
+++ b/drivers/gpu/drm/i915/i915_gem_evict.h
@@ -11,6 +11,7 @@
struct drm_mm_node;
struct i915_address_space;
struct i915_gem_ww_ctx;
+struct drm_i915_gem_object;
int __must_check i915_gem_evict_something(struct i915_address_space *vm,
struct i915_gem_ww_ctx *ww,
@@ -23,6 +24,7 @@ int __must_check i915_gem_evict_for_node(struct i915_address_space *vm,
struct drm_mm_node *node,
unsigned int flags);
int i915_gem_evict_vm(struct i915_address_space *vm,
- struct i915_gem_ww_ctx *ww);
+ struct i915_gem_ww_ctx *ww,
+ struct drm_i915_gem_object **busy_bo);
#endif /* __I915_GEM_EVICT_H__ */
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index f17c09ead7d778..4d06875de14a14 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -1569,7 +1569,7 @@ static int __i915_ggtt_pin(struct i915_vma *vma, struct i915_gem_ww_ctx *ww,
* locked objects when called from execbuf when pinning
* is removed. This would probably regress badly.
*/
- i915_gem_evict_vm(vm, NULL);
+ i915_gem_evict_vm(vm, NULL, NULL);
mutex_unlock(&vm->mutex);
}
} while (1);
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index 8c6517d29b8e0c..37068542aafe7f 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
@@ -344,7 +344,7 @@ static int igt_evict_vm(void *arg)
/* Everything is pinned, nothing should happen */
mutex_lock(&ggtt->vm.mutex);
- err = i915_gem_evict_vm(&ggtt->vm, NULL);
+ err = i915_gem_evict_vm(&ggtt->vm, NULL, NULL);
mutex_unlock(&ggtt->vm.mutex);
if (err) {
pr_err("i915_gem_evict_vm on a full GGTT returned err=%d]\n",
@@ -356,7 +356,7 @@ static int igt_evict_vm(void *arg)
for_i915_gem_ww(&ww, err, false) {
mutex_lock(&ggtt->vm.mutex);
- err = i915_gem_evict_vm(&ggtt->vm, &ww);
+ err = i915_gem_evict_vm(&ggtt->vm, &ww, NULL);
mutex_unlock(&ggtt->vm.mutex);
}