PDS Kernel Configuration

linux 6.0.y: misc-additions: Remove the "mm: vmscan: fix extreme overreclaim and swap floods" patch.
It has been merged upstream.
2022-12-03 06:21:45 -05:00 · 2022-12-03 03:05:49 +01:00 · 2022-12-01 15:19:32 +01:00 · 2022-11-24 17:10:45 +01:00 · 2022-11-24 15:42:21 +01:00 · 2022-11-23 15:13:07 +01:00
6 changed files with 127 additions and 316 deletions
--- a/3
+++ b/3
@@ -31,7 +31,6 @@ _distro="Arch"
 declare -p -x > current_env
 source "$_where"/customization.cfg # load default configuration from file
 source "$_where"/linux-tkg-config/prepare
 if [ -e "$_EXT_CONFIG_PATH" ]; then
  msg2 "External configuration file $_EXT_CONFIG_PATH will be used and will override customization.cfg values."
@@ -40,6 +39,8 @@ fi
 source current_env
 source "$_where"/linux-tkg-config/prepare
 # Make sure we're in a clean state
 if [ ! -e "$_where"/BIG_UGLY_FROGMINER ]; then
  _tkg_initscript
--- a/customization.cfg
+++ b/customization.cfg
@@ -3,7 +3,7 @@
 # Linux distribution you are using, options are "Arch", "Void", "Ubuntu", "Debian", "Fedora", "Suse", "Gentoo", "Generic".
 # It is automatically set to "Arch" when using PKGBUILD.
 # If left empty, the script will prompt
-_distro=""
+_distro="Arch"
 # Kernel Version - Options are "5.4", and from "5.7" to "5.19"
 # you can also set a specific kernel version, e.g. "6.0-rc4" or "5.10.51",
@@ -32,7 +32,7 @@ CUSTOM_GCC_PATH=""
 CUSTOM_LLVM_PATH=""
 # Set to true to bypass makepkg.conf and use all available threads for compilation. False will respect your makepkg.conf options.
-_force_all_threads="true"
+_force_all_threads="false"
 # Set to true to prevent ccache from being used and set CONFIG_GCC_PLUGINS=y (which needs to be disabled for ccache to work properly)
 _noccache="false"
@@ -46,10 +46,10 @@ _modprobeddb="false"
 _modprobeddb_db_path=~/.config/modprobed.db
 # Set to "1" to call make menuconfig, "2" to call make nconfig, "3" to call make xconfig, before building the kernel. Set to false to disable and skip the prompt.
-_menunconfig=""
+_menunconfig="false"
 # Set to true to generate a kernel config fragment from your changes in menuconfig/nconfig. Set to false to disable and skip the prompt.
-_diffconfig=""
+_diffconfig="false"
 # Set to the file name where the generated config fragment should be written to. Only used if _diffconfig is active.
 _diffconfig_name=""
@@ -90,11 +90,11 @@ _STRIP="true"
 # LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME
 # CPU scheduler - Options are "upds" (TkG's Undead PDS), "pds", "bmq", "muqss", "cacule" or "cfs" (kernel's default)
-_cpusched=""
+_cpusched="pds"
 # Compiler to use - Options are "gcc" or "llvm".
 # For advanced users.
-_compiler=""
+_compiler="gcc"
 # Force the use of the LLVM Integrated Assembler whether using LLVM, LTO or not.
 # Set to "1" to enable.
@@ -124,7 +124,7 @@ _preempt_rt_force=""
 # For BMQ:           0: No yield.
 #                    1: Deboost and requeue task. (Default)
 #                    2: Set rq skip task.
-_sched_yield_type=""
+_sched_yield_type="0"
 # Round Robin interval is the longest duration two tasks with the same nice level will be delayed for. When CPU time is requested by a task, it receives a time slice equal
 # to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low value can help offset the disadvantages of rescheduling a process that has yielded.
@@ -132,7 +132,7 @@ _sched_yield_type=""
 # PDS default: 4ms
 # BMQ default: 2ms
 # Set to "1" for 2ms, "2" for 4ms, "3" for 6ms, "4" for 8ms, or "default" to keep the chosen scheduler defaults.
-_rr_interval=""
+_rr_interval="2"
 # Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false"
 _ftracedisable="false"
@@ -147,10 +147,10 @@ _misc_adds="true"
 # Full tickless can give higher performances in case you use isolation of CPUs for tasks
 # and it works only when using the nohz_full kernel parameter, otherwise behaves like idle.
 # Just tickless idle perform better for most platforms.
-_tickless=""
+_tickless="2"
 # Set to "true" to use ACS override patch - https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29 - Kernel default is "false"
-_acs_override=""
+_acs_override="false"
 # Set to "true" to add Bcachefs filesystem support. You'll have to install bcachefs-tools-git from AUR for utilities - https://bcachefs.org/ - If in doubt, set to "false"
 # This can be buggy and isn't recommended on a production machine, also enabling this option will not allow you to enable MGLRU.
@@ -179,13 +179,13 @@ _futex_waitv="false"
 _winesync="false"
 # Set to "true" to enable Binder and Ashmem, the kernel modules required to use the android emulator Anbox. ! This doesn't apply to 5.4.y !
-_anbox=""
+_anbox="false"
 # A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true"
 _zenify="true"
 # compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. Optimize for size (-Os) - Kernel default is "1"
-_compileroptlevel="1"
+_compileroptlevel="2"
 # CPU compiler optimizations - Defaults to prompt at kernel config if left empty
 # AMD CPUs : "k8" "k8sse3" "k10" "barcelona" "bobcat" "jaguar" "bulldozer" "piledriver" "steamroller" "excavator" "zen" "zen2" "zen3" (zen3 opt support depends on GCC11)
@@ -199,7 +199,7 @@ _compileroptlevel="1"
 # - "generic_v2" (depends on GCC11 - to share the package between machines with different CPU µarch supporting at least x86-64-v2)
 # - "generic_v3" (depends on GCC11 - to share the package between machines with different CPU µarch supporting at least x86-64-v3)
 # - "generic_v4" (depends on GCC11 - to share the package between machines with different CPU µarch supporting at least x86-64-v4)
-_processor_opt=""
+_processor_opt="skylake"
 # MuQSS only - Make IRQ threading compulsory (FORCE_IRQ_THREADING) - Default is "false"
 _irq_threading="false"
@@ -215,17 +215,17 @@ _cacule_rdb_interval="19"
 _tt_high_hz="false"
 # MuQSS and PDS only - SMT (Hyperthreading) aware nice priority and policy support (SMT_NICE) - Kernel default is "true" - You can disable this on non-SMT/HT CPUs for lower overhead
-_smt_nice=""
+_smt_nice="true"
 # Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false"
-_random_trust_cpu="false"
+_random_trust_cpu="true"
 # MuQSS only - CPU scheduler runqueue sharing - No sharing (RQ_NONE), SMT (hyperthread) siblings (RQ_SMT), Multicore siblings (RQ_MC), Symmetric Multi-Processing (RQ_SMP), NUMA (RQ_ALL)
 # Valid values are "none", "smt", "mc", "mc-llc"(for zen), "smp", "all" - Kernel default is "smt"
 _runqueue_sharing=""
 # Timer frequency - "100" "250" "300" "500" "750" "1000" ("2000" is available for cacule cpusched only) - More options available in kernel config prompt when left empty depending on selected cpusched with the default option pointed with a ">" (2000 for cacule, 100 for muqss and 1000 for other cpu schedulers)
-_timer_freq=""
+_timer_freq="500"
 # Default CPU governor - "performance", "ondemand", "schedutil" or leave empty for default (schedutil)
 _default_cpu_gov="ondemand"
--- a/install.sh
+++ b/install.sh
@@ -44,6 +44,12 @@ plain() {
 declare -p -x > current_env
 source customization.cfg
 if [ -e "$_EXT_CONFIG_PATH" ]; then
  msg2 "External configuration file $_EXT_CONFIG_PATH will be used and will override customization.cfg values."
  source "$_EXT_CONFIG_PATH"
 fi
 . current_env
 source linux-tkg-config/prepare
--- a/linux-tkg-patches/6.0/0009-prjc_v6.0-r0.patch
+++ b/linux-tkg-patches/6.0/0009-prjc_v6.0-r0.patch
@@ -1,6 +1,6 @@
-From b53bf730e6bba71ebc0ec8452cc2ca399137090e Mon Sep 17 00:00:00 2001
+From 711a56e8f6314d77141b0f661e6c13c8a2c4dddf Mon Sep 17 00:00:00 2001
 From: Tor Vic <torvic9@mailbox.org>
-Date: Mon, 3 Oct 2022 11:19:50 +0200
+Date: Wed, 16 Nov 2022 11:29:00 +0100
 Subject: [PATCH] Project-C 6.0-rc0-vd
 ---
@@ -22,7 +22,7 @@ Subject: [PATCH] Project-C 6.0-rc0-vd
 kernel/exit.c                                 |    4 +-
 kernel/locking/rtmutex.c                      |   16 +-
 kernel/sched/Makefile                         |    5 +
- kernel/sched/alt_core.c                       | 7937 +++++++++++++++++
+ kernel/sched/alt_core.c                       | 7959 +++++++++++++++++
 kernel/sched/alt_debug.c                      |   31 +
 kernel/sched/alt_sched.h                      |  645 ++
 kernel/sched/bmq.h                            |  110 +
@@ -43,7 +43,7 @@ Subject: [PATCH] Project-C 6.0-rc0-vd
 kernel/time/hrtimer.c                         |    2 +
 kernel/time/posix-cpu-timers.c                |   10 +-
 kernel/trace/trace_selftest.c                 |    5 +
- 39 files changed, 9245 insertions(+), 23 deletions(-)
+ 39 files changed, 9267 insertions(+), 23 deletions(-)
 create mode 100644 Documentation/scheduler/sched-BMQ.txt
 create mode 100644 kernel/sched/alt_core.c
 create mode 100644 kernel/sched/alt_debug.c
@@ -685,10 +685,10 @@ index 976092b7bd45..31d587c16ec1 100644
 obj-y += build_utility.o
 diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
 new file mode 100644
-index 000000000000..f3bac14124c3
+index 000000000000..18dfee354f9b
 --- /dev/null
 +++ b/kernel/sched/alt_core.c
-@@ -0,0 +1,7937 @@
+@@ -0,0 +1,7959 @@
 +/*
 + *  kernel/sched/alt_core.c
 + *
@@ -3111,13 +3111,6 @@ index 000000000000..f3bac14124c3
 +	if (!llist)
 +		return;
 +
 +	/*
 +	 * rq::ttwu_pending racy indication of out-standing wakeups.
 +	 * Races such that false-negatives are possible, since they
 +	 * are shorter lived than false-positives would be.
 +	 */
 +	WRITE_ONCE(rq->ttwu_pending, 0);
 +
 +	rq_lock_irqsave(rq, &rf);
 +	update_rq_clock(rq);
 +
@@ -3131,6 +3124,17 @@ index 000000000000..f3bac14124c3
 +		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0);
 +	}
 +
 +	/*
 +	 * Must be after enqueueing at least one task such that
 +	 * idle_cpu() does not observe a false-negative -- if it does,
 +	 * it is possible for select_idle_siblings() to stack a number
 +	 * of tasks on this CPU during that window.
 +	 *
 +	 * It is ok to clear ttwu_pending when another task is pending.
 +	 * We will receive IPI after local irq enabled and then enqueue it.
 +	 * Since now nr_running > 0, idle_cpu() will always get correct result.
 +	 */
 +	WRITE_ONCE(rq->ttwu_pending, 0);
 +	rq_unlock_irqrestore(rq, &rf);
 +}
 +
@@ -3601,6 +3605,40 @@ index 000000000000..f3bac14124c3
 +	return success;
 +}
 +
 +static bool __task_needs_rq_lock(struct task_struct *p)
 +{
 +	unsigned int state = READ_ONCE(p->__state);
 +
 +	/*
 +	 * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
 +	 * the task is blocked. Make sure to check @state since ttwu() can drop
 +	 * locks at the end, see ttwu_queue_wakelist().
 +	 */
 +	if (state == TASK_RUNNING || state == TASK_WAKING)
 +		return true;
 +
 +	/*
 +	 * Ensure we load p->on_rq after p->__state, otherwise it would be
 +	 * possible to, falsely, observe p->on_rq == 0.
 +	 *
 +	 * See try_to_wake_up() for a longer comment.
 +	 */
 +	smp_rmb();
 +	if (p->on_rq)
 +		return true;
 +
 +#ifdef CONFIG_SMP
 +	/*
 +	 * Ensure the task has finished __schedule() and will not be referenced
 +	 * anymore. Again, see try_to_wake_up() for a longer comment.
 +	 */
 +	smp_rmb();
 +	smp_cond_load_acquire(&p->on_cpu, !VAL);
 +#endif
 +
 +	return false;
 +}
 +
 +/**
 + * task_call_func - Invoke a function on task in fixed state
 + * @p: Process for which the function is to be invoked, can be @current.
@@ -3618,28 +3656,12 @@ index 000000000000..f3bac14124c3
 +int task_call_func(struct task_struct *p, task_call_f func, void *arg)
 +{
 +	struct rq *rq = NULL;
 +	unsigned int state;
 +	struct rq_flags rf;
 +	int ret;
 +
 +	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
 +
-+	state = READ_ONCE(p->__state);
+	if (__task_needs_rq_lock(p))
 +
 +	/*
 +	 * Ensure we load p->on_rq after p->__state, otherwise it would be
 +	 * possible to, falsely, observe p->on_rq == 0.
 +	 *
 +	 * See try_to_wake_up() for a longer comment.
 +	 */
 +	smp_rmb();
 +
 +	/*
 +	 * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
 +	 * the task is blocked. Make sure to check @state since ttwu() can drop
 +	 * locks at the end, see ttwu_queue_wakelist().
 +	 */
 +	if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq)
 +		rq = __task_rq_lock(p, &rf);
 +
 +	/*
@@ -10130,5 +10152,55 @@ index a2d301f58ced..2ccdede8585c 100644
 	struct wakeup_test_data *x = data;
 --
-2.37.3
+2.38.1
 diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
 index f3bac14124c3..27eafbccf23d 100644
 --- a/kernel/sched/alt_core.c
 +++ b/kernel/sched/alt_core.c
@@ -1448,11 +1448,13 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	WARN_ON_ONCE(is_migration_disabled(p));
 #endif
 -	if (task_cpu(p) == new_cpu)
 -		return;
 +
 	trace_sched_migrate_task(p, new_cpu);
 -	rseq_migrate(p);
 -	perf_event_task_migrate(p);
 +
 +	if (task_cpu(p) != new_cpu) {
 +		rseq_migrate(p);
 +		perf_event_task_migrate(p);
 +	}
 	__set_task_cpu(p, new_cpu);
 }
 diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
 index f3bac14124c3..5678c247c0ab 100644
 --- a/kernel/sched/alt_core.c
 +++ b/kernel/sched/alt_core.c
@@ -810,8 +810,8 @@ unsigned long get_wchan(struct task_struct *p)
  * Context: rq->lock
  */
 #define __SCHED_DEQUEUE_TASK(p, rq, flags)					\
 -	psi_dequeue(p, flags & DEQUEUE_SLEEP);					\
 	sched_info_dequeue(rq, p);						\
 +	psi_dequeue(p, flags & DEQUEUE_SLEEP);					\
 										\
 	list_del(&p->sq_node);							\
 	if (list_empty(&rq->queue.heads[p->sq_idx])) 				\
 diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
 index f3bac14124c3..349a2c92d534 100644
 --- a/kernel/sched/alt_core.c
 +++ b/kernel/sched/alt_core.c
@@ -4404,8 +4404,8 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
 /*
  * Compile time debug macro
 - * #define ALT_SCHED_DEBUG
  */
 +#define ALT_SCHED_DEBUG
 #ifdef ALT_SCHED_DEBUG
 void alt_sched_debug(void)
--- a/linux-tkg-patches/6.0/0012-misc-additions.patch
+++ b/linux-tkg-patches/6.0/0012-misc-additions.patch
@@ -64,140 +64,6 @@ index 2c7171e0b0010..85de313ddec29 100644
 	select CPU_FREQ_GOV_PERFORMANCE
 	help
 From 2535fbde890f14c78b750139fcf87d1143850626 Mon Sep 17 00:00:00 2001
 From: Johannes Weiner <hannes@cmpxchg.org>
 Date: Tue, 2 Aug 2022 12:28:11 -0400
 Subject: [PATCH] mm: vmscan: fix extreme overreclaim and swap floods
 During proactive reclaim, we sometimes observe severe overreclaim, with
 several thousand times more pages reclaimed than requested.
 This trace was obtained from shrink_lruvec() during such an instance:
    prio:0 anon_cost:1141521 file_cost:7767
    nr_reclaimed:4387406 nr_to_reclaim:1047 (or_factor:4190)
    nr=[7161123 345 578 1111]
 While the reclaimer requested 4M, vmscan reclaimed close to 16G, most of it
 by swapping.  These requests take over a minute, during which the write()
 to memory.reclaim is unkillably stuck inside the kernel.
 Digging into the source, this is caused by the proportional reclaim
 bailout logic.  This code tries to resolve a fundamental conflict: to
 reclaim roughly what was requested, while also aging all LRUs fairly and
 in accordance to their size, swappiness, refault rates etc.  The way it
 attempts fairness is that once the reclaim goal has been reached, it stops
 scanning the LRUs with the smaller remaining scan targets, and adjusts the
 remainder of the bigger LRUs according to how much of the smaller LRUs was
 scanned.  It then finishes scanning that remainder regardless of the
 reclaim goal.
 This works fine if priority levels are low and the LRU lists are
 comparable in size.  However, in this instance, the cgroup that is
 targeted by proactive reclaim has almost no files left - they've already
 been squeezed out by proactive reclaim earlier - and the remaining anon
 pages are hot.  Anon rotations cause the priority level to drop to 0,
 which results in reclaim targeting all of anon (a lot) and all of file
 (almost nothing).  By the time reclaim decides to bail, it has scanned
 most or all of the file target, and therefore must also scan most or all of
 the enormous anon target.  This target is thousands of times larger than
 the reclaim goal, thus causing the overreclaim.
 The bailout code hasn't changed in years, why is this failing now?  The
 most likely explanations are two other recent changes in anon reclaim:
 1. Before the series starting with commit 5df741963d52 ("mm: fix LRU
   balancing effect of new transparent huge pages"), the VM was
   overall relatively reluctant to swap at all, even if swap was
   configured. This means the LRU balancing code didn't come into play
   as often as it does now, and mostly in high pressure situations
   where pronounced swap activity wouldn't be as surprising.
 2. For historic reasons, shrink_lruvec() loops on the scan targets of
   all LRU lists except the active anon one, meaning it would bail if
   the only remaining pages to scan were active anon - even if there
   were a lot of them.
   Before the series starting with commit ccc5dc67340c ("mm/vmscan:
   make active/inactive ratio as 1:1 for anon lru"), most anon pages
   would live on the active LRU; the inactive one would contain only a
   handful of preselected reclaim candidates. After the series, anon
   gets aged similarly to file, and the inactive list is the default
   for new anon pages as well, making it often the much bigger list.
   As a result, the VM is now more likely to actually finish large
   anon targets than before.
 Change the code such that only one SWAP_CLUSTER_MAX-sized nudge toward the
 larger LRU lists is made before bailing out on a met reclaim goal.
 This fixes the extreme overreclaim problem.
 Fairness is more subtle and harder to evaluate.  No obvious misbehavior
 was observed on the test workload, in any case.  Conceptually, fairness
 should primarily be a cumulative effect from regular, lower priority
 scans.  Once the VM is in trouble and needs to escalate scan targets to
 make forward progress, fairness needs to take a backseat.  This is also
 acknowledged by the myriad exceptions in get_scan_count().  This patch
 makes fairness decrease gradually, as it keeps fairness work static over
 increasing priority levels with growing scan targets.  This should make
 more sense - although we may have to re-visit the exact values.
 Link: https://lkml.kernel.org/r/20220802162811.39216-1-hannes@cmpxchg.org
 Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
 Reviewed-by: Rik van Riel <riel@surriel.com>
 Acked-by: Mel Gorman <mgorman@techsingularity.net>
 Cc: Hugh Dickins <hughd@google.com>
 Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
 Cc: <stable@vger.kernel.org>
 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 ---
 mm/vmscan.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index 382dbe97329f33..266eb8cfe93a67 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -2955,8 +2955,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	enum lru_list lru;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 +	bool proportional_reclaim;
 	struct blk_plug plug;
 -	bool scan_adjusted;
 	get_scan_count(lruvec, sc, nr);
@@ -2974,8 +2974,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	 * abort proportional reclaim if either the file or anon lru has already
 	 * dropped to zero at the first pass.
 	 */
 -	scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
 -			 sc->priority == DEF_PRIORITY);
 +	proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
 +				sc->priority == DEF_PRIORITY);
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2995,7 +2995,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 		cond_resched();
 -		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
 +		if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
 			continue;
 		/*
@@ -3046,8 +3046,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 		nr_scanned = targets[lru] - nr[lru];
 		nr[lru] = targets[lru] * (100 - percentage) / 100;
 		nr[lru] -= min(nr[lru], nr_scanned);
 -
 -		scan_adjusted = true;
 	}
 	blk_finish_plug(&plug);
 	sc->nr_reclaimed += nr_reclaimed;
 From 430daaab3c78de6bd82f10cfb5a0f016c6e583f6 Mon Sep 17 00:00:00 2001
 From: Desmond Cheong Zhi Xi <desmondcheongzx@gmail.com>
 Date: Mon, 4 Oct 2021 14:07:34 -0400
--- a/linux-tkg-patches/6.1/0012-misc-additions.patch
+++ b/linux-tkg-patches/6.1/0012-misc-additions.patch
@@ -64,140 +64,6 @@ index 2c7171e0b0010..85de313ddec29 100644
 	select CPU_FREQ_GOV_PERFORMANCE
 	help
 From 2535fbde890f14c78b750139fcf87d1143850626 Mon Sep 17 00:00:00 2001
 From: Johannes Weiner <hannes@cmpxchg.org>
 Date: Tue, 2 Aug 2022 12:28:11 -0400
 Subject: [PATCH] mm: vmscan: fix extreme overreclaim and swap floods
 During proactive reclaim, we sometimes observe severe overreclaim, with
 several thousand times more pages reclaimed than requested.
 This trace was obtained from shrink_lruvec() during such an instance:
    prio:0 anon_cost:1141521 file_cost:7767
    nr_reclaimed:4387406 nr_to_reclaim:1047 (or_factor:4190)
    nr=[7161123 345 578 1111]
 While the reclaimer requested 4M, vmscan reclaimed close to 16G, most of it
 by swapping.  These requests take over a minute, during which the write()
 to memory.reclaim is unkillably stuck inside the kernel.
 Digging into the source, this is caused by the proportional reclaim
 bailout logic.  This code tries to resolve a fundamental conflict: to
 reclaim roughly what was requested, while also aging all LRUs fairly and
 in accordance to their size, swappiness, refault rates etc.  The way it
 attempts fairness is that once the reclaim goal has been reached, it stops
 scanning the LRUs with the smaller remaining scan targets, and adjusts the
 remainder of the bigger LRUs according to how much of the smaller LRUs was
 scanned.  It then finishes scanning that remainder regardless of the
 reclaim goal.
 This works fine if priority levels are low and the LRU lists are
 comparable in size.  However, in this instance, the cgroup that is
 targeted by proactive reclaim has almost no files left - they've already
 been squeezed out by proactive reclaim earlier - and the remaining anon
 pages are hot.  Anon rotations cause the priority level to drop to 0,
 which results in reclaim targeting all of anon (a lot) and all of file
 (almost nothing).  By the time reclaim decides to bail, it has scanned
 most or all of the file target, and therefore must also scan most or all of
 the enormous anon target.  This target is thousands of times larger than
 the reclaim goal, thus causing the overreclaim.
 The bailout code hasn't changed in years, why is this failing now?  The
 most likely explanations are two other recent changes in anon reclaim:
 1. Before the series starting with commit 5df741963d52 ("mm: fix LRU
   balancing effect of new transparent huge pages"), the VM was
   overall relatively reluctant to swap at all, even if swap was
   configured. This means the LRU balancing code didn't come into play
   as often as it does now, and mostly in high pressure situations
   where pronounced swap activity wouldn't be as surprising.
 2. For historic reasons, shrink_lruvec() loops on the scan targets of
   all LRU lists except the active anon one, meaning it would bail if
   the only remaining pages to scan were active anon - even if there
   were a lot of them.
   Before the series starting with commit ccc5dc67340c ("mm/vmscan:
   make active/inactive ratio as 1:1 for anon lru"), most anon pages
   would live on the active LRU; the inactive one would contain only a
   handful of preselected reclaim candidates. After the series, anon
   gets aged similarly to file, and the inactive list is the default
   for new anon pages as well, making it often the much bigger list.
   As a result, the VM is now more likely to actually finish large
   anon targets than before.
 Change the code such that only one SWAP_CLUSTER_MAX-sized nudge toward the
 larger LRU lists is made before bailing out on a met reclaim goal.
 This fixes the extreme overreclaim problem.
 Fairness is more subtle and harder to evaluate.  No obvious misbehavior
 was observed on the test workload, in any case.  Conceptually, fairness
 should primarily be a cumulative effect from regular, lower priority
 scans.  Once the VM is in trouble and needs to escalate scan targets to
 make forward progress, fairness needs to take a backseat.  This is also
 acknowledged by the myriad exceptions in get_scan_count().  This patch
 makes fairness decrease gradually, as it keeps fairness work static over
 increasing priority levels with growing scan targets.  This should make
 more sense - although we may have to re-visit the exact values.
 Link: https://lkml.kernel.org/r/20220802162811.39216-1-hannes@cmpxchg.org
 Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
 Reviewed-by: Rik van Riel <riel@surriel.com>
 Acked-by: Mel Gorman <mgorman@techsingularity.net>
 Cc: Hugh Dickins <hughd@google.com>
 Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
 Cc: <stable@vger.kernel.org>
 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 ---
 mm/vmscan.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index 382dbe97329f33..266eb8cfe93a67 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -2955,8 +2955,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	enum lru_list lru;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 +	bool proportional_reclaim;
 	struct blk_plug plug;
 -	bool scan_adjusted;
 	get_scan_count(lruvec, sc, nr);
@@ -2974,8 +2974,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	 * abort proportional reclaim if either the file or anon lru has already
 	 * dropped to zero at the first pass.
 	 */
 -	scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
 -			 sc->priority == DEF_PRIORITY);
 +	proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
 +				sc->priority == DEF_PRIORITY);
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2995,7 +2995,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 		cond_resched();
 -		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
 +		if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
 			continue;
 		/*
@@ -3046,8 +3046,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 		nr_scanned = targets[lru] - nr[lru];
 		nr[lru] = targets[lru] * (100 - percentage) / 100;
 		nr[lru] -= min(nr[lru], nr_scanned);
 -
 -		scan_adjusted = true;
 	}
 	blk_finish_plug(&plug);
 	sc->nr_reclaimed += nr_reclaimed;
 From 430daaab3c78de6bd82f10cfb5a0f016c6e583f6 Mon Sep 17 00:00:00 2001
 From: Desmond Cheong Zhi Xi <desmondcheongzx@gmail.com>
 Date: Mon, 4 Oct 2021 14:07:34 -0400
Author	SHA1	Message	Date
Sravan Balaji	51914badc5	PDS Kernel Configuration	2022-12-03 06:21:45 -05:00
Tk-Glitch	d4bace1676	linux 6.0.y: misc-additions: Remove `mm: vmscan: fix extreme overreclaim and swap floods` patch. Merged upstream.	2022-12-03 03:05:49 +01:00
timocapa	b108b3a8d2	linux 6.1rc: remove vmscan patch as it was merged upstream (#656 ) * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/mm/vmscan.c?id=f53af4285d775cd9a9a146fc438bd0a1bee1838a	2022-12-01 15:19:32 +01:00
Tk-Glitch	9949ffb1d0	Revert "Sync MGLRU with latest Yuzhao changes: (#654 )" This reverts commit `0b7c5dfa72`.	2022-11-24 17:10:45 +01:00
ViNi-Arco	0b7c5dfa72	Sync MGLRU with latest Yuzhao changes: (#654 ) Yuzhao posted his latest change to his personal repository.. https://github.com/yuzhaogoogle/linux/commits/mglru-6.0 Backup: http://web.archive.org/web/20221123173922/https://github.com/yuzhaogoogle/linux/commits/mglru-6.0	2022-11-24 15:42:21 +01:00
Adel Kara Slimane	75ea27821f	PKGBUILD: sourcing order fix (#653 ) * PKGBUILD: source prepare after the config files The git mirror wasn't configured with the user choice otherwise Fixes: #647 * install.sh: honor external config file	2022-11-23 15:13:07 +01:00
Tk-Glitch	f5cd3ea3b3	cfg: Enable _random_trust_cpu by default See https://github.com/Frogging-Family/linux-tkg/issues/646	2022-11-22 14:30:54 +01:00
ViNi-Arco	81c14e1950	Update tovic9 prjc sched to latest changes - 2: (#652 ) * Update prjc Torvic9 fork Which contains synchronization to the latest kernel with a possible solution for instability. * Clean white space	2022-11-22 02:02:59 +01:00