Compare commits

...

8 Commits

Author SHA1 Message Date
Sravan Balaji
3cc28c3531 PDS Kernel Configuration 2023-10-13 09:45:53 -04:00
Piotr Górski
18f10e2989 6.5: Add new EEVDF patches for BORE 3.2.0 (#826)
Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
2023-10-13 15:26:14 +02:00
FintasticMan
d1caaffa2e Add some more kernel modules to diet config (#814)
Partly taken from my database, partly from @ryanmusante's.
Also update the sorting to match what modprobed-db outputs, so that it's
easier to check which modules are missing from the diet config.
2023-10-09 22:12:49 +02:00
Yifan Zhu
097c2ed1b5 Correct comments for modprobed.db option (#823)
Prompting at build time was removed in commit 1a69f04e6e

Co-authored-by: Yifan Zhu <fanzhuyifan@gmail.com>
2023-10-09 22:11:00 +02:00
Tk-Glitch
cc5e53f109 6.6 RC: Add RX 7000 series Overdrive support patches to misc-additions
Imported from https://gitlab.freedesktop.org/agd5f/linux/-/tree/amd-staging-drm-next
Expected for 6.7. Tested on Navi32.
2023-09-27 08:33:05 +02:00
Tk-Glitch
37369b74eb 6.6 RC: Allow selecting the BORE CPU scheduler 2023-09-27 07:05:41 +02:00
Tk-Glitch
bf02edcc5e 6.6 RC: Refresh defconfig 2023-09-27 07:04:59 +02:00
ptr1337
a4c0ab6b9f Fixup (bore) EEVDF issue in #819 (#820)
Signed-off-by: Peter Jung <admin@ptr1337.dev>
2023-09-23 03:41:08 +02:00
6 changed files with 1299 additions and 740 deletions

View File

@@ -3,7 +3,7 @@
# Linux distribution you are using, options are "Arch", "Ubuntu", "Debian", "Fedora", "Suse", "Gentoo", "Generic".
# It is automatically set to "Arch" when using PKGBUILD.
# If left empty, the script will prompt
_distro=""
_distro="Arch"
# Kernel Version - x.x format without the subversion (will always grab latest available subversion) is recommended
# you can also set a specific kernel version, e.g. "6.0-rc4" or "5.10.51",
@@ -46,7 +46,7 @@ CUSTOM_GCC_PATH=""
CUSTOM_LLVM_PATH=""
# Set to true to bypass makepkg.conf and use all available threads for compilation. False will respect your makepkg.conf options.
_force_all_threads="true"
_force_all_threads="false"
# Set to true to prevent ccache from being used and set CONFIG_GCC_PLUGINS=y (which needs to be disabled for ccache to work properly)
_noccache="false"
@@ -59,17 +59,17 @@ _kernel_on_diet="false"
# Set to true to use modprobed db to clean config from unneeded modules. Speeds up compilation considerably. Requires root - https://wiki.archlinux.org/index.php/Modprobed-db
# Using this option can trigger user prompts if the config doesn't go smoothly.
# !!!! Make sure to have a well populated db !!!! - Leave empty to be asked about it at build time
# !!!! Make sure to have a well populated db !!!!
_modprobeddb="false"
# modprobed-db database file location
_modprobeddb_db_path=~/.config/modprobed.db
# Set to "1" to call make menuconfig, "2" to call make nconfig, "3" to call make xconfig, before building the kernel. Set to false to disable and skip the prompt.
_menunconfig=""
_menunconfig="false"
# Set to true to generate a kernel config fragment from your changes in menuconfig/nconfig. Set to false to disable and skip the prompt.
_diffconfig=""
_diffconfig="false"
# Set to the file name where the generated config fragment should be written to. Only used if _diffconfig is active.
_diffconfig_name=""
@@ -104,11 +104,11 @@ _STRIP="true"
# CPU scheduler - Options are "pds", "bmq", "cacule", "tt", "bore", "bore-eevdf", "eevdf" or "cfs" (kernel's default)
# "upds" (TkG's Undead PDS) and "muqss" are also available on legacy kernel revisions
_cpusched=""
_cpusched="pds"
# Compiler to use - Options are "gcc" or "llvm".
# For advanced users.
_compiler=""
_compiler="gcc"
# Force the use of the LLVM Integrated Assembler whether using LLVM, LTO or not.
# Set to "1" to enable.
@@ -138,7 +138,7 @@ _preempt_rt_force=""
# For BMQ: 0: No yield.
# 1: Deboost and requeue task. (Default)
# 2: Set rq skip task.
_sched_yield_type=""
_sched_yield_type="0"
# Round Robin interval is the longest duration two tasks with the same nice level will be delayed for. When CPU time is requested by a task, it receives a time slice equal
# to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low value can help offset the disadvantages of rescheduling a process that has yielded.
@@ -146,7 +146,7 @@ _sched_yield_type=""
# PDS default: 4ms"
# BMQ default: 2ms"
# Set to "1" for 2ms, "2" for 4ms, "3" for 6ms, "4" for 8ms, or "default" to keep the chosen scheduler defaults.
_rr_interval=""
_rr_interval="default"
# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false"
_ftracedisable="false"
@@ -161,10 +161,10 @@ _misc_adds="true"
# Full tickless can give higher performances in case you use isolation of CPUs for tasks
# and it works only when using the nohz_full kernel parameter, otherwise behaves like idle.
# Just tickless idle perform better for most platforms.
_tickless=""
_tickless="2"
# Set to "true" to use ACS override patch - https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29 - Kernel default is "false"
_acs_override=""
_acs_override="false"
# Set to "true" to add Bcache filesystem support. You'll have to install bcachefs-tools-git from AUR for utilities - https://bcachefs.org/ - If in doubt, set to "false"
# This can be buggy and isn't recommended on a production machine, also enabling this option will not allow you to enable MGLRU.
@@ -175,7 +175,7 @@ _bcachefs="false"
_winesync="false"
# Set to "true" to enable Binder modules to use Waydroid Android containers
_waydroid=""
_waydroid="false"
# Various patches and tweaks from Zen/Liquorix, Xanmod and the community - Default is "true"
_glitched_base="true"
@@ -185,7 +185,7 @@ _glitched_base="true"
_zenify="true"
# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. Optimize for size (-Os) - Kernel default is "1"
_compileroptlevel="1"
_compileroptlevel="2"
# CPU compiler optimizations - Defaults to prompt at kernel config if left empty
# AMD CPUs : "k8" "k8sse3" "k10" "barcelona" "bobcat" "jaguar" "bulldozer" "piledriver" "steamroller" "excavator" "zen" "zen2" "zen3" "zen4" (zen3 opt support depends on GCC11) (zen4 opt support depends on GCC13)
@@ -199,7 +199,7 @@ _compileroptlevel="1"
# - "generic_v2" (depends on GCC11 - to share the package between machines with different CPU µarch supporting at least x86-64-v2
# - "generic_v3" (depends on GCC11 - to share the package between machines with different CPU µarch supporting at least x86-64-v3
# - "generic_v4" (depends on GCC11 - to share the package between machines with different CPU µarch supporting at least x86-64-v4
_processor_opt=""
_processor_opt="skylake"
# CacULE only - Enable Response Driven Balancer, an experimental load balancer for CacULE
_cacule_rdb="false"
@@ -212,13 +212,13 @@ _cacule_rdb_interval="19"
_tt_high_hz="false"
# MuQSS and PDS only - SMT (Hyperthreading) aware nice priority and policy support (SMT_NICE) - Kernel default is "true" - You can disable this on non-SMT/HT CPUs for lower overhead
_smt_nice=""
_smt_nice="true"
# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false"
_random_trust_cpu="true"
# Timer frequency - "100" "250" "300" "500" "750" "1000" ("2000" is available for cacule cpusched only) - More options available in kernel config prompt when left empty depending on selected cpusched with the default option pointed with a ">" (2000 for cacule, 100 for muqss and 1000 for other cpu schedulers)
_timer_freq=""
_timer_freq="1000"
# Default CPU governor - "performance", "ondemand", "schedutil" or leave empty for default (schedutil)
_default_cpu_gov="ondemand"
@@ -234,7 +234,7 @@ _aggressive_ondemand="true"
_tcp_cong_alg=""
# You can pass a default set of kernel command line options here - example: "intel_pstate=passive nowatchdog amdgpu.ppfeaturemask=0xfffd7fff mitigations=off"
_custom_commandline="intel_pstate=passive split_lock_detect=off"
_custom_commandline=""
# Selection of Clearlinux patches
_clear_patches="true"
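Taken together, the changes to this file amount to the following fragment (new values only, copied from the right-hand side of the diff above; the surrounding comments are omitted):

    _distro="Arch"
    _force_all_threads="false"
    _menunconfig="false"
    _diffconfig="false"
    _cpusched="pds"
    _compiler="gcc"
    _sched_yield_type="0"
    _rr_interval="default"
    _tickless="2"
    _acs_override="false"
    _waydroid="false"
    _compileroptlevel="2"
    _processor_opt="skylake"
    _smt_nice="true"
    _timer_freq="1000"
    _custom_commandline=""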

View File

@@ -25,9 +25,9 @@ algif_hash
algif_skcipher
alx
amd64_edac
amdgpu
amd_pmc
amd_pstate
amdgpu
amdxcp
apple_mfi_fastcharge
appletalk
@@ -54,8 +54,8 @@ bluetooth
bnep
bpf
bpf_preload
bridge
br_netfilter
bridge
btbcm
btcoexist
btintel
@@ -91,10 +91,10 @@ cmdlinepart
coretemp
cpufreq_ondemand
crc16
crc32c_generic
crc32c_intel
crc32_generic
crc32_pclmul
crc32c_generic
crc32c_intel
crc64
crc64_rocksoft
crc64_rocksoft_generic
@@ -109,13 +109,13 @@ cuse
dca
des_generic
dm_crypt
dmi_sysfs
dm_log
dm_mirror
dm_mod
dm_multipath
dm_region_hash
dm_round_robin
dmi_sysfs
dns_resolver
drm
drm_buddy
@@ -127,10 +127,10 @@ drm_ttm_helper
dvb_core
ebtable_filter
ebtables
ec_sys
ecb
ecc
ecdh_generic
ec_sys
edac_mce_amd
ee1004
eeepc_wmi
@@ -178,7 +178,6 @@ hid_logitech_hidpp
hid_microsoft
hid_multitouch
hid_nintendo
hidp
hid_playstation
hid_roccat
hid_roccat_common
@@ -187,6 +186,8 @@ hid_roccat_ryos
hid_sony
hid_steam
hid_wiimote
hidp
hp_wmi
hv_balloon
hv_netvsc
hv_storvsc
@@ -207,6 +208,8 @@ i2c_piix4
i2c_smbus
i8042
i915
iTCO_vendor_support
iTCO_wdt
ib_cm
ib_core
idma64
@@ -215,10 +218,12 @@ igc
inet_diag
input_leds
int3400_thermal
int3403_thermal
int340x_thermal_zone
intel_agp
intel_cstate
intel_gtt
intel_hid
intel_lpss
intel_lpss_pci
intel_pch_thermal
@@ -227,37 +232,36 @@ intel_powerclamp
intel_rapl_common
intel_rapl_msr
intel_soc_dts_iosf
intel_tcc_cooling
intel_uncore
intel_vsec
intel_wmi_thunderbolt
iommufd
iommu_v2
iommufd
ip6_tables
ip6_udp_tunnel
ip6t_REJECT
ip6t_rt
ip6table_filter
ip6table_mangle
ip6table_nat
ip6table_raw
ip6_tables
ip6table_security
ip6t_REJECT
ip6t_rt
ip6_udp_tunnel
ip_set
ip_tables
ipmi_devintf
ipmi_msghandler
ip_set
ipt_REJECT
iptable_filter
iptable_mangle
iptable_nat
iptable_raw
ip_tables
iptable_security
ipt_REJECT
ipv6
ir_kbd_i2c
irqbypass
isofs
it87
iTCO_vendor_support
iTCO_wdt
iw_cm
iwlmei
iwlmvm
@@ -301,15 +305,17 @@ mbcache
mc
mc44s803
md4
md_mod
mdio
mdio_devres
md_mod
mei
mei_gsc
mei_hdcp
mei_me
mei_pxp
mii
minix
mmc_core
mousedev
mptcp_diag
mrp
@@ -352,14 +358,14 @@ nf_nat_irc
nf_nat_pptp
nf_nat_sip
nf_nat_tftp
nf_reject_ipv4
nf_reject_ipv6
nf_tables
nfnetlink
nfnetlink_log
nfnetlink_queue
nf_reject_ipv4
nf_reject_ipv6
nfs
nfsv4
nf_tables
nft_chain_nat
nft_compat
nft_ct
@@ -383,6 +389,7 @@ nls_utf8
nouveau
ntfs
ntfs3
nvidia_wmi_ec_backlight
nvme
nvme_common
nvme_core
@@ -438,12 +445,18 @@ rt2800usb
rt2x00lib
rt2x00usb
rtl8192ee
rtl8723_common
rtl8723ae
rtl8723be
rtl8723_common
rtl8821ae
rtl_pci
rtlwifi
rtsx_pci
rtsx_pci_sdmmc
rtw88_8821c
rtw88_8821ce
rtw88_core
rtw88_pci
sch_cake
sch_fq_codel
sch_ingress
@@ -460,8 +473,11 @@ ses
sg
sha512_ssse3
snd
snd_acp3x_pdm_dma
snd_acp3x_rn
snd_acp6x_pdm_dma
snd_acp_config
snd_acp_pci
snd_aloop
snd_compress
snd_ctl_led
@@ -479,10 +495,12 @@ snd_intel_sdw_acpi
snd_pci_acp3x
snd_pci_acp5x
snd_pci_acp6x
snd_pci_ps
snd_pcm
snd_pcm_dmaengine
snd_rawmidi
snd_rn_pci_acp3x
snd_rpl_pci_acp6x
snd_seq
snd_seq_device
snd_seq_dummy
@@ -501,12 +519,14 @@ snd_soc_sst_dsp
snd_soc_sst_ipc
snd_sof
snd_sof_amd_acp
snd_sof_amd_rembrandt
snd_sof_amd_renoir
snd_sof_intel_hda
snd_sof_intel_hda_common
snd_sof_intel_hda_mlink
snd_sof_pci
snd_sof_pci_intel_cnl
snd_sof_pci_intel_tgl
snd_sof_probes
snd_sof_utils
snd_sof_xtensa_dsp
@@ -514,9 +534,9 @@ snd_timer
snd_ua101
snd_ump
snd_usb_audio
snd_usbmidi_lib
snd_usb_us122l
snd_usb_usx2y
snd_usbmidi_lib
snd_virmidi
soundcore
soundwire_bus
@@ -549,6 +569,7 @@ tea5767
tee
tg3
thermal
thunderbolt
tiny_power_button
tls
tpm
@@ -578,14 +599,14 @@ uio
uio_pdrv_genirq
unix_diag
usb_common
usb_storage
usb_wwan
usbcore
usbhid
usbip_core
usbip_host
usbmon
usbnet
usb_storage
usb_wwan
uvc
uvcvideo
uvesafb
@@ -618,9 +639,11 @@ vsock_loopback
wacom
watchdog
wireguard
wireless_hotkey
wmi
wmi_bmof
x86_pkg_temp_thermal
x_tables
xc2028
xc4000
xc5000
@@ -632,21 +655,20 @@ xhci_pci
xhci_pci_renesas
xor
xpad
x_tables
xt_CHECKSUM
xt_LOG
xt_MASQUERADE
xt_NFQUEUE
xt_REDIRECT
xt_addrtype
xt_cgroup
xt_CHECKSUM
xt_comment
xt_conntrack
xt_hl
xt_limit
xt_LOG
xt_mark
xt_MASQUERADE
xt_nat
xt_NFQUEUE
xt_recent
xt_REDIRECT
xt_state
xt_tcpudp
xxhash_generic
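Per the commit message, the list is now sorted the way modprobed-db prints it, which makes diffing against a local database straightforward. A minimal sketch, assuming modprobed-db is installed, the database sits at ~/.config/modprobed.db, and the diet list above has been saved locally as diet-modules.txt (hypothetical file name):

    modprobed-db store                                                        # record the currently loaded modules in the database
    comm -23 <(sort -u ~/.config/modprobed.db) <(sort -u diet-modules.txt)    # modules present locally but missing from the diet config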

View File

@@ -1,6 +1,6 @@
#
# Automatically generated file; DO NOT EDIT.
# Linux/x86 6.6.0-rc1 Kernel Configuration
# Linux/x86 6.6.0-rc3 Kernel Configuration
#
CONFIG_CC_VERSION_TEXT="gcc (GCC) 13.2.1 20230801"
CONFIG_CC_IS_GCC=y
@@ -4701,7 +4701,6 @@ CONFIG_I2C_MUX_REG=m
CONFIG_I2C_MUX_MLXCPLD=m
# end of Multiplexer I2C Chip support
CONFIG_I2C_ATR=m
CONFIG_I2C_HELPER_AUTO=y
CONFIG_I2C_SMBUS=m
CONFIG_I2C_ALGOBIT=m
@@ -10276,8 +10275,8 @@ CONFIG_XFS_DRAIN_INTENTS=y
CONFIG_XFS_ONLINE_SCRUB=y
CONFIG_XFS_ONLINE_SCRUB_STATS=y
CONFIG_XFS_ONLINE_REPAIR=y
# CONFIG_XFS_WARN is not set
# CONFIG_XFS_DEBUG is not set
CONFIG_XFS_DEBUG=y
CONFIG_XFS_ASSERT_FATAL=y
CONFIG_GFS2_FS=m
CONFIG_GFS2_FS_LOCKING_DLM=y
CONFIG_OCFS2_FS=m

View File

@@ -257,7 +257,7 @@ _set_cpu_scheduler() {
["upds"]="Undead PDS (TkG)"
["cacule"]="CacULE"
["tt"]="TT (TaskType)"
["bore"]="BORE (Burst-Oriented Response Enhancer - CFS variant) CPU Scheduler"
["bore"]="BORE (Burst-Oriented Response Enhancer) CPU Scheduler"
["bore-eevdf"]="BORE (Burst-Oriented Response Enhancer - EEVDF variant) CPU Scheduler"
["eevdf"]="Earliest Eligible Virtual Deadline First (EEVDF) scheduler"
)
@@ -302,7 +302,7 @@ _set_cpu_scheduler() {
elif [ "$_kver" = "605" ]; then
_avail_cpu_scheds=("cfs" "eevdf" "pds" "bmq" "tt" "bore" "bore-eevdf")
elif [ "$_kver" = "606" ]; then
_avail_cpu_scheds=("eevdf")
_avail_cpu_scheds=("eevdf" "bore")
else
_avail_cpu_scheds=("cfs")
fi
@@ -859,11 +859,7 @@ _tkg_srcprep() {
if [ "${_cpusched}" = "bore-eevdf" ]; then
_msg="Applying BORE-EEVDF patch"
if [ "$_kver" != "605" ]; then
curl "https://raw.githubusercontent.com/CachyOS/kernel-patches/master/${_basekernel}/sched/0001-bore-eevdf.patch" > "$srcdir"/0001-bore-eevdf.patch
else
curl "https://raw.githubusercontent.com/sirlucjan/kernel-patches/master/${_basekernel}/bore-eevdf-patches-v2-sep/0016-linux6.5-bore3.1.3.patch" > "$srcdir"/0001-bore-eevdf.patch
fi
tkgpatch="$srcdir/0001-bore-eevdf.patch" && _tkg_patcher
fi
fi
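With the version-specific branch removed, the BORE-EEVDF patch is now always pulled from the CachyOS tree. A sketch of fetching and dry-run testing it by hand, assuming _basekernel expands to 6.5 and the unpacked kernel source tree is the current directory:

    curl "https://raw.githubusercontent.com/CachyOS/kernel-patches/master/6.5/sched/0001-bore-eevdf.patch" -o 0001-bore-eevdf.patch
    patch -Np1 --dry-run < 0001-bore-eevdf.patch    # verify the patch applies cleanly before a real build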

View File

@@ -1,7 +1,7 @@
From 6f9fee6b2a2ceb4561a58c152467fd5e6d5c47e8 Mon Sep 17 00:00:00 2001
From d931ed7fc8d6728204d36d31a18d4c8b60593821 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:40 +0200
Subject: [PATCH 01/15] sched/fair: Add cfs_rq::avg_vruntime
Subject: [PATCH 01/16] sched/fair: Add cfs_rq::avg_vruntime
In order to move to an eligibility based scheduling policy, we need
to have a better approximation of the ideal scheduler.
@@ -295,10 +295,10 @@ index e93e006a9..4ccb73d85 100644
2.42.0
From 826b8e2df1d3c69e138c6c89f6872df2be4ad1cb Mon Sep 17 00:00:00 2001
From 4e5d4ab816239fc30595a76ffcd41c323bdd4996 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:41 +0200
Subject: [PATCH 02/15] sched/fair: Remove sched_feat(START_DEBIT)
Subject: [PATCH 02/16] sched/fair: Remove sched_feat(START_DEBIT)
With the introduction of avg_vruntime() there is no need to use worse
approximations. Take the 0-lag point as starting point for inserting
@@ -372,10 +372,10 @@ index ee7f23c76..fa828b365 100644
2.42.0
From 5672ddd48026e6f590a9eae4d122bb0eed50e109 Mon Sep 17 00:00:00 2001
From 49ba3e84689bb047d9411e8a3a6ae99020070f37 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:42 +0200
Subject: [PATCH 03/15] sched/fair: Add lag based placement
Subject: [PATCH 03/16] sched/fair: Add lag based placement
With the introduction of avg_vruntime, it is possible to approximate
lag (the entire purpose of introducing it in fact). Use this to do lag
@@ -410,7 +410,7 @@ index 609bde814..52910bfb9 100644
u64 nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c52c2eba7..3bb4df5bb 100644
index e8f73ff12..acb9d9ff3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4501,6 +4501,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
@@ -661,10 +661,10 @@ index fa828b365..7958a10fe 100644
2.42.0
From e9818f093795a5d7b1ee08248d8db84ed88411dd Mon Sep 17 00:00:00 2001
From 31462b52019e938357395e7bd0f630fcd550e27c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:43 +0200
Subject: [PATCH 04/15] rbtree: Add rb_add_augmented_cached() helper
Subject: [PATCH 04/16] rbtree: Add rb_add_augmented_cached() helper
While slightly sub-optimal, updating the augmented data while going
down the tree during lookup would be faster -- alas the augment
@@ -719,10 +719,10 @@ index 7ee7ed5de..6dbc5a1bf 100644
2.42.0
From cb798272c085050f0db104befcf8092da0931210 Mon Sep 17 00:00:00 2001
From e8c55c05618756cf090470c355f2864dafe0a618 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:44 +0200
Subject: [PATCH 05/15] sched/fair: Implement an EEVDF-like scheduling policy
Subject: [PATCH 05/16] sched/fair: Implement an EEVDF-like scheduling policy
Where CFS is currently a WFQ based scheduler with only a single knob,
the weight. The addition of a second, latency oriented parameter,
@@ -785,7 +785,7 @@ index 52910bfb9..35331c35f 100644
u64 nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bb4df5bb..d7291206f 100644
index acb9d9ff3..427d694ff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4502,6 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
@@ -1398,10 +1398,10 @@ index 4ccb73d85..1fc81dd7f 100644
2.42.0
From 792befe9ba4d972eeb1ba144cfa3062e48fd98ff Mon Sep 17 00:00:00 2001
From 6aa7145ce28656863846e7f67ad98e3ed89473f3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:45 +0200
Subject: [PATCH 06/15] sched/fair: Commit to lag based placement
Subject: [PATCH 06/16] sched/fair: Commit to lag based placement
Removes the FAIR_SLEEPERS code in favour of the new LAG based
placement.
@@ -1526,10 +1526,10 @@ index 60cce1e6f..2a830eccd 100644
2.42.0
From 26b3a580ff53a5b6e3b01810b7f223b672cab5e9 Mon Sep 17 00:00:00 2001
From 12c67a50f08fe4b97fda8f13302e2574e10351c7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:46 +0200
Subject: [PATCH 07/15] sched/smp: Use lag to simplify cross-runqueue placement
Subject: [PATCH 07/16] sched/smp: Use lag to simplify cross-runqueue placement
Using lag is both more correct and simpler when moving between
runqueues.
@@ -1793,10 +1793,10 @@ index 91f25d6c8..b7daccfb2 100644
2.42.0
From 35645d3d36593126531a3ee2f7402c9acfdb3e6d Mon Sep 17 00:00:00 2001
From 8e2fcd5cb320987439faec8442f7f73ccb234875 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:47 +0200
Subject: [PATCH 08/15] sched/fair: Commit to EEVDF
Subject: [PATCH 08/16] sched/fair: Commit to EEVDF
EEVDF is a better defined scheduling policy, as a result it has less
heuristics/tunables. There is no compelling reason to keep CFS around.
@@ -2569,10 +2569,10 @@ index 1fc81dd7f..83bbcd35c 100644
2.42.0
From a1bff7f7a7608a50d8b1108e68f766daa920b4a3 Mon Sep 17 00:00:00 2001
From 55aa8349238fbe34a1f8198d56210a5e773851f1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:48 +0200
Subject: [PATCH 09/15] sched/debug: Rename sysctl_sched_min_granularity to
Subject: [PATCH 09/16] sched/debug: Rename sysctl_sched_min_granularity to
sysctl_sched_base_slice
EEVDF uses this tunable as the base request/slice -- make sure the
@@ -2589,7 +2589,7 @@ Link: https://lore.kernel.org/r/20230531124604.205287511@infradead.org
4 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d7291206f..8116ef56d 100644
index 427d694ff..be77d999d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4502,7 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
@@ -2685,10 +2685,10 @@ index 83bbcd35c..e21f6a048 100644
2.42.0
From 5ef098d5a57aa3f3a054935b500d694c4027834a Mon Sep 17 00:00:00 2001
From d059ffad9f9729ec63ad32fc3840a1a308cbd8a7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 31 May 2023 13:58:49 +0200
Subject: [PATCH 10/15] sched/fair: Propagate enqueue flags into place_entity()
Subject: [PATCH 10/16] sched/fair: Propagate enqueue flags into place_entity()
This allows place_entity() to consider ENQUEUE_WAKEUP and
ENQUEUE_MIGRATED.
@@ -2766,10 +2766,10 @@ index e21f6a048..576d371c8 100644
2.42.0
From 30054e00408a19d0a9ba9c2682217544b09c4937 Mon Sep 17 00:00:00 2001
From 80cdbd469974a44e5150be88f5c696ec241f6087 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Aug 2023 15:40:59 +0200
Subject: [PATCH 11/15] sched/eevdf: Curb wakeup-preemption
Subject: [PATCH 11/16] sched/eevdf: Curb wakeup-preemption
Mike and others noticed that EEVDF does like to over-schedule quite a
bit -- which does hurt performance of a number of benchmarks /
@@ -2858,10 +2858,10 @@ index 54334ca5c..546d212ef 100644
2.42.0
From 3e8371461b6d790eb57788495c157d3092ae4ce9 Mon Sep 17 00:00:00 2001
From 7d5bf4ed3cc74835a55db18eead11af61557a795 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Date: Thu, 24 Aug 2023 13:33:42 +0530
Subject: [PATCH 12/15] sched/eevdf/doc: Modify the documented knob to
Subject: [PATCH 12/16] sched/eevdf/doc: Modify the documented knob to
base_slice_ns as well
After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns'
@@ -2897,10 +2897,10 @@ index 03db55504..f68919800 100644
2.42.0
From edbc7fe6658db891c80f244dc397f4e0247f6f3d Mon Sep 17 00:00:00 2001
From bff784de63e9a8567d91b630e8f2bf842aef894b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Sep 2023 00:48:55 +0200
Subject: [PATCH 13/15] sched/eevdf: Also update slice on placement
Subject: [PATCH 13/16] sched/eevdf: Also update slice on placement
Tasks that never consume their full slice would not update their slice value.
This means that tasks that are spawned before the sysctl scaling keep their
@@ -2908,6 +2908,7 @@ original (UP) slice length.
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230915124822.847197830@noisy.programming.kicks-ass.net
---
kernel/sched/fair.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
@@ -2935,475 +2936,259 @@ index 1cdc95725..efbcdc69c 100644
2.42.0
From 0f1fadfb03ba9ba181e4631de8cd97ba765fae1d Mon Sep 17 00:00:00 2001
From 163619e41993d6e481a745466c05cc0dfb3dcda8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Sep 2023 00:48:45 +0200
Subject: [PATCH 14/15] sched/eevdf: Delay dequeue
Date: Tue, 26 Sep 2023 14:29:50 +0200
Subject: [PATCH 14/16] sched/eevdf: Fix avg_vruntime()
For tasks that have negative-lag (have received 'excess' service), delay the
dequeue and keep them in the runnable tree until they're elegible again. Or
rather, keep them until they're selected again, since finding their elegibility
crossover point is expensive.
The expectation is that placing a task at avg_vruntime() makes it
eligible. Turns out there is a corner case where this is not the case.
The effect is a bit like sleeper bonus, the tasks keep contending for service
until either they get a wakeup or until they're selected again and are really
dequeued.
Specifically, avg_vruntime() relies on the fact that integer division
is a flooring function (eg. it discards the remainder). By this
property the value returned is slightly left of the true average.
This means that any actual dequeue happens with positive lag (serviced owed)
and are more readily ran when wakeup.
However! when the average is a negative (relative to min_vruntime) the
effect is flipped and it becomes a ceil, with the result that the
returned value is just right of the average and thus not eligible.
Fixes: af4cf40470c2 ("sched/fair: Add cfs_rq::avg_vruntime")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 41 +++++++++++++++++++++++++++++++++++------
kernel/sched/fair.c | 9 +++++++++
kernel/sched/features.h | 1 +
kernel/sched/sched.h | 3 ++-
5 files changed, 48 insertions(+), 7 deletions(-)
kernel/sched/fair.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 35331c35f..d40d98313 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -891,6 +891,7 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
+ unsigned sched_delayed:1;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index efbcdc69c..9dbf3ce61 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -664,6 +664,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
}
/* Force alignment to the next boundary: */
unsigned :0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8116ef56d..cfb0ffa69 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6551,6 +6551,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
# define SM_MASK_PREEMPT SM_PREEMPT
#endif
+/*
+ * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * For this to be so, the result of this function must have a left bias.
+ */
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
@@ -677,8 +681,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
load += weight;
}
+static void __deschedule_task(struct rq *rq, struct task_struct *p)
+{
+ deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+
+ if (p->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
- if (load)
+ if (load) {
+ /* sign flips effective floor / ceil */
+ if (avg < 0)
+ avg -= (load - 1);
avg = div_s64(avg, load);
+ }
+}
+
/*
* __schedule() is the main scheduler function.
*
@@ -6663,17 +6673,36 @@ static void __sched notrace __schedule(unsigned int sched_mode)
*
* After this, schedule() must not care about p->state any more.
return cfs_rq->min_vruntime + avg;
}
--
2.42.0
From 217895647edb558ce9b28d0e07418f66fdaf85bc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 6 Oct 2023 21:24:45 +0200
Subject: [PATCH 15/16] sched/eevdf: Fix min_deadline heap integrity
Marek and Biju reported instances of:
"EEVDF scheduling fail, picking leftmost"
which Mike correlated with cgroup scheduling and the min_deadline heap
getting corrupted; some trace output confirms:
> And yeah, min_deadline is hosed somehow:
>
> validate_cfs_rq: --- /
> __print_se: ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -66302085 E (11372/tr)
> __print_se: ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -66302085 E (-1//autogroup-31)
> __print_se: ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N (-1//autogroup-33)
> validate_cfs_rq: min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256
Turns out that reweight_entity(), which tries really hard to be fast,
does not do the normal dequeue+update+enqueue pattern but *does* scale
the deadline.
However, it then fails to propagate the updated deadline value up the
heap.
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reported-by: Biju Das <biju.das.jz@bp.renesas.com>
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231006192445.GE743@noisy.programming.kicks-ass.net
---
kernel/sched/fair.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9dbf3ce61..a0f1d9578 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3612,6 +3612,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
*/
- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+ if (!(sched_feat(DELAY_DEQUEUE) &&
+ prev->sched_class->eligible_task &&
+ !prev->sched_class->eligible_task(rq, prev)))
+ __deschedule_task(rq, prev);
+ else
+ prev->sched_delayed = 1;
+ }
+ switch_count = &prev->nvcsw;
+ }
+
+ for (struct task_struct *tmp = prev;;) {
deadline = div_s64(deadline * old_weight, weight);
se->deadline = se->vruntime + deadline;
+ min_deadline_cb_propagate(&se->run_node, NULL);
}
- if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
- delayacct_blkio_start();
+ next = pick_next_task(rq, tmp, &rf);
+ if (unlikely(tmp != prev))
+ finish_task(tmp);
+
+ if (sched_feat(DELAY_DEQUEUE) && unlikely(next->sched_delayed)) {
+ next->sched_delayed = 0;
+ if (READ_ONCE(next->__state)) {
+ prepare_task(next);
+ smp_wmb();
+ __deschedule_task(rq, next);
+ tmp = next;
+ continue;
#ifdef CONFIG_SMP
--
2.42.0
From 71f1c08f8102e48a5235bb145af59edfa597cf72 Mon Sep 17 00:00:00 2001
From: Benjamin Segall <bsegall@google.com>
Date: Fri, 29 Sep 2023 17:09:30 -0700
Subject: [PATCH 16/16] sched/eevdf: Fix pick_eevdf()
The old pick_eevdf() could fail to find the actual earliest eligible
deadline when it descended to the right looking for min_deadline, but
it turned out that that min_deadline wasn't actually eligible. In that
case we need to go back and search through any left branches we
skipped looking for the actual best _eligible_ min_deadline.
This is more expensive, but still O(log n), and at worst should only
involve descending two branches of the rbtree.
I've run this through a userspace stress test (thank you
tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
corner cases.
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/xm261qego72d.fsf_-_@google.com
---
kernel/sched/fair.c | 72 ++++++++++++++++++++++++++++++++++++---------
1 file changed, 58 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a0f1d9578..caec9b43c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,14 +872,16 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
*
* Which allows an EDF like search on (sub)trees.
*/
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *curr = cfs_rq->curr;
struct sched_entity *best = NULL;
+ struct sched_entity *best_left = NULL;
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
+ best = curr;
/*
* Once selected, run a task until it either becomes non-eligible or
@@ -900,33 +902,75 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
}
}
- switch_count = &prev->nvcsw;
/*
- * If this entity has an earlier deadline than the previous
- * best, take this one. If it also has the earliest deadline
- * of its subtree, we're done.
+ * Now we heap search eligible trees for the best (min_)deadline
*/
- if (!best || deadline_gt(deadline, best, se)) {
+ if (!best || deadline_gt(deadline, best, se))
best = se;
- if (best->deadline == best->min_deadline)
- break;
- }
/*
- * If the earlest deadline in this subtree is in the fully
- * eligible left half of our space, go there.
+ * Every se in a left branch is eligible, keep track of the
+ * branch with the best min_deadline
*/
+ if (node->rb_left) {
+ struct sched_entity *left = __node_2_se(node->rb_left);
+
+ if (!best_left || deadline_gt(min_deadline, best_left, left))
+ best_left = left;
+
+ /*
+ * min_deadline is in the left branch. rb_left and all
+ * descendants are eligible, so immediately switch to the second
+ * loop.
+ */
+ if (left->min_deadline == se->min_deadline)
+ break;
}
- next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index efbcdc69c..729507e40 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8174,6 +8174,14 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
return pick_next_task_fair(rq, NULL, NULL);
}
+static bool eligible_task_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ }
+
+ return entity_eligible(cfs_rq, se);
+ /* min_deadline is at this node, no need to look right */
+ if (se->deadline == se->min_deadline)
+ break;
+
+ /* else min_deadline is in the right branch. */
+ node = node->rb_right;
+ }
+
+ /*
+ * We ran into an eligible node which is itself the best.
+ * (Or nr_running == 0 and both are NULL)
+ */
+ if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
+ return best;
+
+ /*
+ * Now best_left and all of its children are eligible, and we are just
+ * looking for deadline == min_deadline
+ */
+ node = &best_left->run_node;
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /* min_deadline is the current node */
+ if (se->deadline == se->min_deadline)
+ return se;
+
+ /* min_deadline is in the left branch */
if (node->rb_left &&
__node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
node = node->rb_left;
continue;
}
+ /* else min_deadline is in the right branch */
node = node->rb_right;
}
+ return NULL;
+}
+
/*
* Account for a descheduled task:
*/
@@ -12628,6 +12636,7 @@ DEFINE_SCHED_CLASS(fair) = {
.check_preempt_curr = check_preempt_wakeup,
- if (!best || (curr && deadline_gt(deadline, best, curr)))
- best = curr;
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se = __pick_eevdf(cfs_rq);
+ .eligible_task = eligible_task_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 546d212ef..5ae5a6f92 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -7,6 +7,7 @@
SCHED_FEAT(PLACE_LAG, true)
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
SCHED_FEAT(RUN_TO_PARITY, true)
+SCHED_FEAT(DELAY_DEQUEUE, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 576d371c8..c18ab7c2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2219,6 +2219,7 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+ bool (*eligible_task)(struct rq *rq, struct task_struct *p);
struct task_struct *(*pick_next_task)(struct rq *rq);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
@@ -2272,7 +2273,7 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- WARN_ON_ONCE(rq->curr != prev);
+// WARN_ON_ONCE(rq->curr != prev);
prev->sched_class->put_prev_task(rq, prev);
- if (unlikely(!best)) {
+ if (!se) {
struct sched_entity *left = __pick_first_entity(cfs_rq);
if (left) {
pr_err("EEVDF scheduling fail, picking leftmost\n");
@@ -934,7 +978,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
}
}
--
2.42.0
From 4aba3e1c3bbe4a36d4b9e405be8a66d7c10d6495 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 May 2023 13:46:30 +0200
Subject: [PATCH 15/15] sched/eevdf: Use sched_attr::sched_runtime to set
request/slice suggestion
Allow applications to directly set a suggested request/slice length using
sched_attr::sched_runtime.
The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
Applications should strive to use their periodic runtime at a high
confidence interval (95%+) as the target slice. Using a smaller slice
will introduce undue preemptions, while using a larger value will
increase latency.
For all the following examples assume a scheduling quantum of 8, and for
consistency all examples have W=4:
{A,B,C,D}(w=1,r=8):
ABCD...
+---+---+---+---
t=0, V=1.5 t=1, V=3.5
A |------< A |------<
B |------< B |------<
C |------< C |------<
D |------< D |------<
---+*------+-------+--- ---+--*----+-------+---
t=2, V=5.5 t=3, V=7.5
A |------< A |------<
B |------< B |------<
C |------< C |------<
D |------< D |------<
---+----*--+-------+--- ---+------*+-------+---
Note: 4 identical tasks in FIFO order
~~~
{A,B}(w=1,r=16) C(w=2,r=16)
AACCBBCC...
+---+---+---+---
t=0, V=1.25 t=2, V=5.25
A |--------------< A |--------------<
B |--------------< B |--------------<
C |------< C |------<
---+*------+-------+--- ---+----*--+-------+---
t=4, V=8.25 t=6, V=12.25
A |--------------< A |--------------<
B |--------------< B |--------------<
C |------< C |------<
---+-------*-------+--- ---+-------+---*---+---
Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2
task doesn't go below q.
Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length.
Note: the period of the heavy task is half the full period at:
W*(r_i/w_i) = 4*(2q/2) = 4q
~~~
{A,C,D}(w=1,r=16) B(w=1,r=8):
BAACCBDD...
+---+---+---+---
t=0, V=1.5 t=1, V=3.5
A |--------------< A |---------------<
B |------< B |------<
C |--------------< C |--------------<
D |--------------< D |--------------<
---+*------+-------+--- ---+--*----+-------+---
t=3, V=7.5 t=5, V=11.5
A |---------------< A |---------------<
B |------< B |------<
C |--------------< C |--------------<
D |--------------< D |--------------<
---+------*+-------+--- ---+-------+--*----+---
t=6, V=13.5
A |---------------<
B |------<
C |--------------<
D |--------------<
---+-------+----*--+---
Note: 1 short task -- again double r so that the deadline of the short task
won't be below q. Made B short because its not the leftmost task, but is
eligible with the 0,1,2,3 spread.
Note: like with the heavy task, the period of the short task observes:
W*(r_i/w_i) = 4*(1q/1) = 4q
~~~
A(w=1,r=16) B(w=1,r=8) C(w=2,r=16)
BCCAABCC...
+---+---+---+---
t=0, V=1.25 t=1, V=3.25
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+*------+-------+--- ---+--*----+-------+---
t=3, V=7.25 t=5, V=11.25
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+------*+-------+--- ---+-------+--*----+---
t=6, V=13.25
A |--------------<
B |------<
C |------<
---+-------+----*--+---
Note: 1 heavy and 1 short task -- combine them all.
Note: both the short and heavy task end up with a period of 4q
~~~
A(w=1,r=16) B(w=2,r=16) C(w=1,r=8)
BBCAABBC...
+---+---+---+---
t=0, V=1 t=2, V=5
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+*------+-------+--- ---+----*--+-------+---
t=3, V=7 t=5, V=11
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+------*+-------+--- ---+-------+--*----+---
t=7, V=15
A |--------------<
B |------<
C |------<
---+-------+------*+---
Note: as before but permuted
~~~
From all this it can be deduced that, for the steady state:
- the total period (P) of a schedule is: W*max(r_i/w_i)
- the average period of a task is: W*(r_i/w_i)
- each task obtains the fair share: w_i/W of each full period P
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 3 +++
kernel/sched/core.c | 33 ++++++++++++++++++++++++++-------
kernel/sched/fair.c | 6 ++++--
3 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d40d98313..93c03b162 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -555,6 +555,9 @@ struct sched_entity {
struct list_head group_node;
unsigned int on_rq;
+ unsigned int custom_slice : 1;
+ /* 31 bits hole */
+
u64 exec_start;
u64 sum_exec_runtime;
u64 prev_sum_exec_runtime;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cfb0ffa69..1ae5a8272 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4502,7 +4502,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
- p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -4756,6 +4755,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
/*
* We don't need the reset flag anymore after the fork. It has
@@ -7556,10 +7557,20 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.custom_slice = 1;
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
@@ -7744,7 +7755,9 @@ static int __sched_setscheduler(struct task_struct *p,
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
@@ -7890,6 +7903,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
+ if (p->se.custom_slice)
+ attr.sched_runtime = p->se.slice;
+
/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
@@ -8066,12 +8082,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}
/**
@@ -10090,6 +10108,7 @@ void __init sched_init(void)
}
set_load_weight(&init_task, false);
+ init_task.se.slice = sysctl_sched_base_slice,
/*
* The boot idle thread does lazy MMU switching as well:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 729507e40..51e19a1fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -973,7 +973,8 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
* nice) while the request time r_i is determined by
* sysctl_sched_base_slice.
*/
- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
/*
* EEVDF: vd_i = ve_i + r_i / w_i
@@ -4921,7 +4922,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);
/*
- return best;
+ return se;
}
#ifdef CONFIG_SCHED_DEBUG
--
2.42.0
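Two of the user-visible knobs touched by this refreshed patch set can be inspected at runtime. A sketch, assuming a kernel built with CONFIG_SCHED_DEBUG and debugfs mounted at /sys/kernel/debug (the first file name reflects the rename in patch 09/16):

    cat /sys/kernel/debug/sched/base_slice_ns                              # base request/slice, formerly min_granularity_ns
    grep -E 'PLACE_LAG|RUN_TO_PARITY' /sys/kernel/debug/sched/features     # EEVDF placement/parity feature flags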

View File

@@ -64,3 +64,760 @@ index 2c7171e0b0010..85de313ddec29 100644
select CPU_FREQ_GOV_PERFORMANCE
help
From 7695eb71d0872ed9633daf0ca779da3344b87dec Mon Sep 17 00:00:00 2001
From: Evan Quan <evan.quan@amd.com>
Date: Mon, 21 Aug 2023 14:15:13 +0800
Subject: [PATCH] drm/amd/pm: correct SMU13 gfx voltage related OD settings
The voltage offset setting will be applied to the whole v/f curve line
instead of per anchor point base.
Signed-off-by: Evan Quan <evan.quan@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/amd/pm/amdgpu_pm.c | 45 +++++++------------
.../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 31 ++++++-------
.../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 31 ++++++-------
3 files changed, 43 insertions(+), 64 deletions(-)
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 1da7ece4c627..06aa5c18b40f 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -643,18 +643,14 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,
* They can be used to calibrate the sclk voltage curve. This is
* available for Vega20 and NV1X.
*
- * - voltage offset for the six anchor points of the v/f curve labeled
- * OD_VDDC_CURVE. They can be used to calibrate the v/f curve. This
- * is only availabe for some SMU13 ASICs.
- *
* - voltage offset(in mV) applied on target voltage calculation.
- * This is available for Sienna Cichlid, Navy Flounder and Dimgrey
- * Cavefish. For these ASICs, the target voltage calculation can be
- * illustrated by "voltage = voltage calculated from v/f curve +
- * overdrive vddgfx offset"
+ * This is available for Sienna Cichlid, Navy Flounder, Dimgrey
+ * Cavefish and some later SMU13 ASICs. For these ASICs, the target
+ * voltage calculation can be illustrated by "voltage = voltage
+ * calculated from v/f curve + overdrive vddgfx offset"
*
- * - a list of valid ranges for sclk, mclk, and voltage curve points
- * labeled OD_RANGE
+ * - a list of valid ranges for sclk, mclk, voltage curve points
+ * or voltage offset labeled OD_RANGE
*
* < For APUs >
*
@@ -686,24 +682,17 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,
* E.g., "p 2 0 800" would set the minimum core clock on core
* 2 to 800Mhz.
*
- * For sclk voltage curve,
- * - For NV1X, enter the new values by writing a string that
- * contains "vc point clock voltage" to the file. The points
- * are indexed by 0, 1 and 2. E.g., "vc 0 300 600" will update
- * point1 with clock set as 300Mhz and voltage as 600mV. "vc 2
- * 1000 1000" will update point3 with clock set as 1000Mhz and
- * voltage 1000mV.
- * - For SMU13 ASICs, enter the new values by writing a string that
- * contains "vc anchor_point_index voltage_offset" to the file.
- * There are total six anchor points defined on the v/f curve with
- * index as 0 - 5.
- * - "vc 0 10" will update the voltage offset for point1 as 10mv.
- * - "vc 5 -10" will update the voltage offset for point6 as -10mv.
- *
- * To update the voltage offset applied for gfxclk/voltage calculation,
- * enter the new value by writing a string that contains "vo offset".
- * This is supported by Sienna Cichlid, Navy Flounder and Dimgrey Cavefish.
- * And the offset can be a positive or negative value.
+ * For sclk voltage curve supported by Vega20 and NV1X, enter the new
+ * values by writing a string that contains "vc point clock voltage"
+ * to the file. The points are indexed by 0, 1 and 2. E.g., "vc 0 300
+ * 600" will update point1 with clock set as 300Mhz and voltage as 600mV.
+ * "vc 2 1000 1000" will update point3 with clock set as 1000Mhz and
+ * voltage 1000mV.
+ *
+ * For voltage offset supported by Sienna Cichlid, Navy Flounder, Dimgrey
+ * Cavefish and some later SMU13 ASICs, enter the new value by writing a
+ * string that contains "vo offset". E.g., "vo -10" will update the extra
+ * voltage offset applied to the whole v/f curve line as -10mv.
*
* - When you have edited all of the states as needed, write "c" (commit)
* to the file to commit your changes
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 3903a47669e4..bd0d5f027cac 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -1304,16 +1304,14 @@ static int smu_v13_0_0_print_clk_levels(struct smu_context *smu,
od_table->OverDriveTable.UclkFmax);
break;
- case SMU_OD_VDDC_CURVE:
+ case SMU_OD_VDDGFX_OFFSET:
if (!smu_v13_0_0_is_od_feature_supported(smu,
PP_OD_FEATURE_GFX_VF_CURVE_BIT))
break;
- size += sysfs_emit_at(buf, size, "OD_VDDC_CURVE:\n");
- for (i = 0; i < PP_NUM_OD_VF_CURVE_POINTS; i++)
- size += sysfs_emit_at(buf, size, "%d: %dmv\n",
- i,
- od_table->OverDriveTable.VoltageOffsetPerZoneBoundary[i]);
+ size += sysfs_emit_at(buf, size, "OD_VDDGFX_OFFSET:\n");
+ size += sysfs_emit_at(buf, size, "%dmV\n",
+ od_table->OverDriveTable.VoltageOffsetPerZoneBoundary[0]);
break;
case SMU_OD_RANGE:
@@ -1355,7 +1353,7 @@ static int smu_v13_0_0_print_clk_levels(struct smu_context *smu,
PP_OD_FEATURE_GFX_VF_CURVE,
&min_value,
&max_value);
- size += sysfs_emit_at(buf, size, "VDDC_CURVE: %7dmv %10dmv\n",
+ size += sysfs_emit_at(buf, size, "VDDGFX_OFFSET: %7dmv %10dmv\n",
min_value, max_value);
}
break;
@@ -1504,29 +1502,26 @@ static int smu_v13_0_0_od_edit_dpm_table(struct smu_context *smu,
}
break;
- case PP_OD_EDIT_VDDC_CURVE:
+ case PP_OD_EDIT_VDDGFX_OFFSET:
if (!smu_v13_0_0_is_od_feature_supported(smu, PP_OD_FEATURE_GFX_VF_CURVE_BIT)) {
- dev_warn(adev->dev, "VF curve setting not supported!\n");
+ dev_warn(adev->dev, "Gfx offset setting not supported!\n");
return -ENOTSUPP;
}
- if (input[0] >= PP_NUM_OD_VF_CURVE_POINTS ||
- input[0] < 0)
- return -EINVAL;
-
smu_v13_0_0_get_od_setting_limits(smu,
PP_OD_FEATURE_GFX_VF_CURVE,
&minimum,
&maximum);
- if (input[1] < minimum ||
- input[1] > maximum) {
+ if (input[0] < minimum ||
+ input[0] > maximum) {
dev_info(adev->dev, "Voltage offset (%ld) must be within [%d, %d]!\n",
- input[1], minimum, maximum);
+ input[0], minimum, maximum);
return -EINVAL;
}
- od_table->OverDriveTable.VoltageOffsetPerZoneBoundary[input[0]] = input[1];
- od_table->OverDriveTable.FeatureCtrlMask |= 1U << PP_OD_FEATURE_GFX_VF_CURVE_BIT;
+ for (i = 0; i < PP_NUM_OD_VF_CURVE_POINTS; i++)
+ od_table->OverDriveTable.VoltageOffsetPerZoneBoundary[i] = input[0];
+ od_table->OverDriveTable.FeatureCtrlMask |= BIT(PP_OD_FEATURE_GFX_VF_CURVE_BIT);
break;
case PP_OD_RESTORE_DEFAULT_TABLE:
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
index 94ef5b4d116d..b9b3bf41eed3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
@@ -1284,16 +1284,14 @@ static int smu_v13_0_7_print_clk_levels(struct smu_context *smu,
od_table->OverDriveTable.UclkFmax);
break;
- case SMU_OD_VDDC_CURVE:
+ case SMU_OD_VDDGFX_OFFSET:
if (!smu_v13_0_7_is_od_feature_supported(smu,
PP_OD_FEATURE_GFX_VF_CURVE_BIT))
break;
- size += sysfs_emit_at(buf, size, "OD_VDDC_CURVE:\n");
- for (i = 0; i < PP_NUM_OD_VF_CURVE_POINTS; i++)
- size += sysfs_emit_at(buf, size, "%d: %dmv\n",
- i,
- od_table->OverDriveTable.VoltageOffsetPerZoneBoundary[i]);
+ size += sysfs_emit_at(buf, size, "OD_VDDGFX_OFFSET:\n");
+ size += sysfs_emit_at(buf, size, "%dmV\n",
+ od_table->OverDriveTable.VoltageOffsetPerZoneBoundary[0]);
break;
case SMU_OD_RANGE:
@@ -1335,7 +1333,7 @@ static int smu_v13_0_7_print_clk_levels(struct smu_context *smu,
PP_OD_FEATURE_GFX_VF_CURVE,
&min_value,
&max_value);
- size += sysfs_emit_at(buf, size, "VDDC_CURVE: %7dmv %10dmv\n",
+ size += sysfs_emit_at(buf, size, "VDDGFX_OFFSET: %7dmv %10dmv\n",
min_value, max_value);
}
break;
@@ -1484,29 +1482,26 @@ static int smu_v13_0_7_od_edit_dpm_table(struct smu_context *smu,
}
break;
- case PP_OD_EDIT_VDDC_CURVE:
+ case PP_OD_EDIT_VDDGFX_OFFSET:
if (!smu_v13_0_7_is_od_feature_supported(smu, PP_OD_FEATURE_GFX_VF_CURVE_BIT)) {
- dev_warn(adev->dev, "VF curve setting not supported!\n");
+ dev_warn(adev->dev, "Gfx offset setting not supported!\n");
return -ENOTSUPP;
}
- if (input[0] >= PP_NUM_OD_VF_CURVE_POINTS ||
- input[0] < 0)
- return -EINVAL;
-
smu_v13_0_7_get_od_setting_limits(smu,
PP_OD_FEATURE_GFX_VF_CURVE,
&minimum,
&maximum);
- if (input[1] < minimum ||
- input[1] > maximum) {
+ if (input[0] < minimum ||
+ input[0] > maximum) {
dev_info(adev->dev, "Voltage offset (%ld) must be within [%d, %d]!\n",
- input[1], minimum, maximum);
+ input[0], minimum, maximum);
return -EINVAL;
}
- od_table->OverDriveTable.VoltageOffsetPerZoneBoundary[input[0]] = input[1];
- od_table->OverDriveTable.FeatureCtrlMask |= 1U << PP_OD_FEATURE_GFX_VF_CURVE_BIT;
+ for (i = 0; i < PP_NUM_OD_VF_CURVE_POINTS; i++)
+ od_table->OverDriveTable.VoltageOffsetPerZoneBoundary[i] = input[0];
+ od_table->OverDriveTable.FeatureCtrlMask |= BIT(PP_OD_FEATURE_GFX_VF_CURVE_BIT);
break;
case PP_OD_RESTORE_DEFAULT_TABLE:
--
GitLab
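The documentation rewritten above describes the new single-offset interface. A sketch of exercising it from sysfs, assuming the GPU is exposed as card0 and overdrive is enabled; the -10 mV value is purely illustrative:

    cat /sys/class/drm/card0/device/pp_od_clk_voltage                 # now shows OD_VDDGFX_OFFSET plus its OD_RANGE
    echo "vo -10" > /sys/class/drm/card0/device/pp_od_clk_voltage     # offset the whole v/f curve by -10 mV
    echo "c" > /sys/class/drm/card0/device/pp_od_clk_voltage          # commit the change, as described in the docs above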
From 8bad128720ebc69e37f1c66767fb276088ef4fa7 Mon Sep 17 00:00:00 2001
From: Evan Quan <evan.quan@amd.com>
Date: Wed, 16 Aug 2023 14:51:19 +0800
Subject: [PATCH] drm/amd/pm: fulfill the support for SMU13 `pp_dpm_dcefclk`
interface
Fulfill the incomplete SMU13 `pp_dpm_dcefclk` implementation.
Reported-by: Guan Yu <guan.yu@amd.com>
Signed-off-by: Evan Quan <evan.quan@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
.../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 27 +++++++++++++++++++
.../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 27 +++++++++++++++++++
2 files changed, 54 insertions(+)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index bd0d5f027cac..5fdb2b3c042a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -176,6 +176,7 @@ static struct cmn2asic_mapping smu_v13_0_0_clk_map[SMU_CLK_COUNT] = {
CLK_MAP(VCLK1, PPCLK_VCLK_1),
CLK_MAP(DCLK, PPCLK_DCLK_0),
CLK_MAP(DCLK1, PPCLK_DCLK_1),
+ CLK_MAP(DCEFCLK, PPCLK_DCFCLK),
};
static struct cmn2asic_mapping smu_v13_0_0_feature_mask_map[SMU_FEATURE_COUNT] = {
@@ -707,6 +708,22 @@ static int smu_v13_0_0_set_default_dpm_table(struct smu_context *smu)
pcie_table->num_of_link_levels++;
}
+ /* dcefclk dpm table setup */
+ dpm_table = &dpm_context->dpm_tables.dcef_table;
+ if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_DCN_BIT)) {
+ ret = smu_v13_0_set_single_dpm_table(smu,
+ SMU_DCEFCLK,
+ dpm_table);
+ if (ret)
+ return ret;
+ } else {
+ dpm_table->count = 1;
+ dpm_table->dpm_levels[0].value = smu->smu_table.boot_values.dcefclk / 100;
+ dpm_table->dpm_levels[0].enabled = true;
+ dpm_table->min = dpm_table->dpm_levels[0].value;
+ dpm_table->max = dpm_table->dpm_levels[0].value;
+ }
+
return 0;
}
@@ -794,6 +811,9 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu,
case METRICS_CURR_FCLK:
*value = metrics->CurrClock[PPCLK_FCLK];
break;
+ case METRICS_CURR_DCEFCLK:
+ *value = metrics->CurrClock[PPCLK_DCFCLK];
+ break;
case METRICS_AVERAGE_GFXCLK:
if (metrics->AverageGfxActivity <= SMU_13_0_0_BUSY_THRESHOLD)
*value = metrics->AverageGfxclkFrequencyPostDs;
@@ -1047,6 +1067,9 @@ static int smu_v13_0_0_get_current_clk_freq_by_table(struct smu_context *smu,
case PPCLK_DCLK_1:
member_type = METRICS_AVERAGE_DCLK1;
break;
+ case PPCLK_DCFCLK:
+ member_type = METRICS_CURR_DCEFCLK;
+ break;
default:
return -EINVAL;
}
@@ -1196,6 +1219,9 @@ static int smu_v13_0_0_print_clk_levels(struct smu_context *smu,
case SMU_DCLK1:
single_dpm_table = &(dpm_context->dpm_tables.dclk_table);
break;
+ case SMU_DCEFCLK:
+ single_dpm_table = &(dpm_context->dpm_tables.dcef_table);
+ break;
default:
break;
}
@@ -1209,6 +1235,7 @@ static int smu_v13_0_0_print_clk_levels(struct smu_context *smu,
case SMU_VCLK1:
case SMU_DCLK:
case SMU_DCLK1:
+ case SMU_DCEFCLK:
ret = smu_v13_0_0_get_current_clk_freq_by_table(smu, clk_type, &curr_freq);
if (ret) {
dev_err(smu->adev->dev, "Failed to get current clock freq!");
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
index b9b3bf41eed3..12949928e285 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
@@ -147,6 +147,7 @@ static struct cmn2asic_mapping smu_v13_0_7_clk_map[SMU_CLK_COUNT] = {
CLK_MAP(VCLK1, PPCLK_VCLK_1),
CLK_MAP(DCLK, PPCLK_DCLK_0),
CLK_MAP(DCLK1, PPCLK_DCLK_1),
+ CLK_MAP(DCEFCLK, PPCLK_DCFCLK),
};
static struct cmn2asic_mapping smu_v13_0_7_feature_mask_map[SMU_FEATURE_COUNT] = {
@@ -696,6 +697,22 @@ static int smu_v13_0_7_set_default_dpm_table(struct smu_context *smu)
pcie_table->num_of_link_levels++;
}
+ /* dcefclk dpm table setup */
+ dpm_table = &dpm_context->dpm_tables.dcef_table;
+ if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_DCN_BIT)) {
+ ret = smu_v13_0_set_single_dpm_table(smu,
+ SMU_DCEFCLK,
+ dpm_table);
+ if (ret)
+ return ret;
+ } else {
+ dpm_table->count = 1;
+ dpm_table->dpm_levels[0].value = smu->smu_table.boot_values.dcefclk / 100;
+ dpm_table->dpm_levels[0].enabled = true;
+ dpm_table->min = dpm_table->dpm_levels[0].value;
+ dpm_table->max = dpm_table->dpm_levels[0].value;
+ }
+
return 0;
}
@@ -777,6 +794,9 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu,
case METRICS_CURR_FCLK:
*value = metrics->CurrClock[PPCLK_FCLK];
break;
+ case METRICS_CURR_DCEFCLK:
+ *value = metrics->CurrClock[PPCLK_DCFCLK];
+ break;
case METRICS_AVERAGE_GFXCLK:
*value = metrics->AverageGfxclkFrequencyPreDs;
break;
@@ -1027,6 +1047,9 @@ static int smu_v13_0_7_get_current_clk_freq_by_table(struct smu_context *smu,
case PPCLK_DCLK_1:
member_type = METRICS_CURR_DCLK1;
break;
+ case PPCLK_DCFCLK:
+ member_type = METRICS_CURR_DCEFCLK;
+ break;
default:
return -EINVAL;
}
@@ -1176,6 +1199,9 @@ static int smu_v13_0_7_print_clk_levels(struct smu_context *smu,
case SMU_DCLK1:
single_dpm_table = &(dpm_context->dpm_tables.dclk_table);
break;
+ case SMU_DCEFCLK:
+ single_dpm_table = &(dpm_context->dpm_tables.dcef_table);
+ break;
default:
break;
}
@@ -1189,6 +1215,7 @@ static int smu_v13_0_7_print_clk_levels(struct smu_context *smu,
case SMU_VCLK1:
case SMU_DCLK:
case SMU_DCLK1:
+ case SMU_DCEFCLK:
ret = smu_v13_0_7_get_current_clk_freq_by_table(smu, clk_type, &curr_freq);
if (ret) {
dev_err(smu->adev->dev, "Failed to get current clock freq!");
--
GitLab
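
The hunks above add a DCEFCLK entry to the clock map and, when the DCN DPM feature is not enabled, synthesize a one-level DPM table from the boot DCEF clock. Below is a minimal, standalone sketch of that fallback pattern; the struct and function names are simplified stand-ins for the kernel's smu_13_0_dpm_table plumbing, not the real definitions, and the /100 conversion assumes the boot clock is stored in 10 kHz units.

/*
 * Illustrative sketch only (not kernel code): mimic the fallback path in
 * smu_v13_0_*_set_default_dpm_table when SMU_FEATURE_DPM_DCN_BIT is off --
 * a one-level table is built from the boot DCEF clock.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dpm_level { uint32_t value; bool enabled; };

struct dpm_table {
	uint32_t count;
	uint32_t min, max;              /* MHz */
	struct dpm_level dpm_levels[4];
};

/* Assumed: boot value is in 10 kHz units, hence the /100 to get MHz. */
static void fill_boot_only_table(struct dpm_table *t, uint32_t boot_dcefclk_10khz)
{
	t->count = 1;
	t->dpm_levels[0].value = boot_dcefclk_10khz / 100;
	t->dpm_levels[0].enabled = true;
	t->min = t->dpm_levels[0].value;
	t->max = t->dpm_levels[0].value;
}

int main(void)
{
	struct dpm_table dcef = { 0 };

	fill_boot_only_table(&dcef, 60000); /* 600 MHz boot clock */
	printf("DCEFCLK levels: %u, min/max: %u/%u MHz\n",
	       dcef.count, dcef.min, dcef.max);
	return 0;
}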
From 3a2fb905145e76e4bbb32e90e0c6cd532dafb1b0 Mon Sep 17 00:00:00 2001
From: Evan Quan <evan.quan@amd.com>
Date: Mon, 14 Aug 2023 10:16:27 +0800
Subject: [PATCH] Revert "drm/amd/pm: disable the SMU13 OD feature support
temporarily"
This reverts commit 3592cc20beeece83db4c50a0f400e2dd15139de9.
The enablement of the new OD mechanism is complete. Also, support for
the fan control related OD feature has been added via this new mechanism.
Thus, it is time to bring back SMU13 OD support.
Signed-off-by: Evan Quan <evan.quan@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
.../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 18 +++---------------
.../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 12 +++---------
2 files changed, 6 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index c48f81450d24..093962a37688 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -348,13 +348,10 @@ static int smu_v13_0_0_check_powerplay_table(struct smu_context *smu)
table_context->power_play_table;
struct smu_baco_context *smu_baco = &smu->smu_baco;
PPTable_t *pptable = smu->smu_table.driver_pptable;
-#if 0
- PPTable_t *pptable = smu->smu_table.driver_pptable;
const OverDriveLimits_t * const overdrive_upperlimits =
&pptable->SkuTable.OverDriveLimitsBasicMax;
const OverDriveLimits_t * const overdrive_lowerlimits =
&pptable->SkuTable.OverDriveLimitsMin;
-#endif
if (powerplay_table->platform_caps & SMU_13_0_0_PP_PLATFORM_CAP_HARDWAREDC)
smu->dc_controlled_by_gpio = true;
@@ -366,27 +363,18 @@ static int smu_v13_0_0_check_powerplay_table(struct smu_context *smu)
if (powerplay_table->platform_caps & SMU_13_0_0_PP_PLATFORM_CAP_MACO)
smu_baco->maco_support = true;
- /*
- * We are in the transition to a new OD mechanism.
- * Disable the OD feature support for SMU13 temporarily.
- * TODO: get this reverted when new OD mechanism online
- */
-#if 0
if (!overdrive_lowerlimits->FeatureCtrlMask ||
!overdrive_upperlimits->FeatureCtrlMask)
smu->od_enabled = false;
+ table_context->thermal_controller_type =
+ powerplay_table->thermal_controller_type;
+
/*
* Instead of having its own buffer space and get overdrive_table copied,
* smu->od_settings just points to the actual overdrive_table
*/
smu->od_settings = &powerplay_table->overdrive_table;
-#else
- smu->od_enabled = false;
-#endif
-
- table_context->thermal_controller_type =
- powerplay_table->thermal_controller_type;
smu->adev->pm.no_fan =
!(pptable->SkuTable.FeaturesToRun[0] & (1 << FEATURE_FAN_CONTROL_BIT));
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
index 99bc449799a6..430ad1b05ba3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
@@ -338,12 +338,10 @@ static int smu_v13_0_7_check_powerplay_table(struct smu_context *smu)
struct smu_baco_context *smu_baco = &smu->smu_baco;
PPTable_t *smc_pptable = table_context->driver_pptable;
BoardTable_t *BoardTable = &smc_pptable->BoardTable;
-#if 0
const OverDriveLimits_t * const overdrive_upperlimits =
&smc_pptable->SkuTable.OverDriveLimitsBasicMax;
const OverDriveLimits_t * const overdrive_lowerlimits =
&smc_pptable->SkuTable.OverDriveLimitsMin;
-#endif
if (powerplay_table->platform_caps & SMU_13_0_7_PP_PLATFORM_CAP_HARDWAREDC)
smu->dc_controlled_by_gpio = true;
@@ -355,22 +353,18 @@ static int smu_v13_0_7_check_powerplay_table(struct smu_context *smu)
if (smu_baco->platform_support && (BoardTable->HsrEnabled || BoardTable->VddqOffEnabled))
smu_baco->maco_support = true;
-#if 0
if (!overdrive_lowerlimits->FeatureCtrlMask ||
!overdrive_upperlimits->FeatureCtrlMask)
smu->od_enabled = false;
+ table_context->thermal_controller_type =
+ powerplay_table->thermal_controller_type;
+
/*
* Instead of having its own buffer space and get overdrive_table copied,
* smu->od_settings just points to the actual overdrive_table
*/
smu->od_settings = &powerplay_table->overdrive_table;
-#else
- smu->od_enabled = false;
-#endif
-
- table_context->thermal_controller_type =
- powerplay_table->thermal_controller_type;
return 0;
}
--
GitLab
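
For reference, a minimal sketch of the gating this revert restores: overdrive stays enabled only when both the upper and lower OverDrive limit tables advertise a non-zero FeatureCtrlMask. Types and names below are simplified stand-ins for the kernel structures, not the real definitions.

/*
 * Illustrative sketch only: OD support is disabled when either limit
 * table reports an empty feature control mask.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct od_limits { uint32_t feature_ctrl_mask; };

static bool od_supported(const struct od_limits *upper,
			 const struct od_limits *lower)
{
	return upper->feature_ctrl_mask && lower->feature_ctrl_mask;
}

int main(void)
{
	struct od_limits upper = { .feature_ctrl_mask = 0x3f };
	struct od_limits lower = { .feature_ctrl_mask = 0x00 };
	bool od_enabled = true; /* assume enabled by default */

	if (!od_supported(&upper, &lower))
		od_enabled = false;

	printf("od_enabled = %s\n", od_enabled ? "true" : "false");
	return 0;
}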
From 072a8dc3b5260ba08ba2e66036c2c63abd77df52 Mon Sep 17 00:00:00 2001
From: Lijo Lazar <lijo.lazar@amd.com>
Date: Thu, 24 Aug 2023 17:25:51 +0530
Subject: [PATCH] drm/amd/pm: Fix clock reporting for SMUv13.0.6
On SMU v13.0.6, the effective clocks reported by FW won't exactly
match the DPM levels. Report the current clock based on the DPM level
whose value is closest to the effective clock. Also, when deep sleep is
applied to a clock, report it with a special level "S:", as in the
sample clock levels below
S: 19Mhz *
0: 615Mhz
1: 800Mhz
2: 888Mhz
3: 1000Mhz
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Evan Quan <evan.quan@amd.com>
---
.../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 159 +++++++-----------
1 file changed, 62 insertions(+), 97 deletions(-)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index c2308783053c..29e1cada7667 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -91,6 +91,8 @@
#define PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE__SHIFT 0x5
#define LINK_SPEED_MAX 4
+#define SMU_13_0_6_DSCLK_THRESHOLD 100
+
static const struct cmn2asic_msg_mapping smu_v13_0_6_message_map[SMU_MSG_MAX_COUNT] = {
MSG_MAP(TestMessage, PPSMC_MSG_TestMessage, 0),
MSG_MAP(GetSmuVersion, PPSMC_MSG_GetSmuVersion, 1),
@@ -783,13 +785,61 @@ static int smu_v13_0_6_get_current_clk_freq_by_table(struct smu_context *smu,
return smu_v13_0_6_get_smu_metrics_data(smu, member_type, value);
}
+static int smu_v13_0_6_print_clks(struct smu_context *smu, char *buf,
+ struct smu_13_0_dpm_table *single_dpm_table,
+ uint32_t curr_clk, const char *clk_name)
+{
+ struct pp_clock_levels_with_latency clocks;
+ int i, ret, size = 0, level = -1;
+ uint32_t clk1, clk2;
+
+ ret = smu_v13_0_6_get_clk_table(smu, &clocks, single_dpm_table);
+ if (ret) {
+ dev_err(smu->adev->dev, "Attempt to get %s clk levels failed!",
+ clk_name);
+ return ret;
+ }
+
+ if (!clocks.num_levels)
+ return -EINVAL;
+
+ if (curr_clk < SMU_13_0_6_DSCLK_THRESHOLD) {
+ size = sysfs_emit_at(buf, size, "S: %uMhz *\n", curr_clk);
+ for (i = 0; i < clocks.num_levels; i++)
+ size += sysfs_emit_at(buf, size, "%d: %uMhz\n", i,
+ clocks.data[i].clocks_in_khz /
+ 1000);
+
+ } else {
+ if ((clocks.num_levels == 1) ||
+ (curr_clk < (clocks.data[0].clocks_in_khz / 1000)))
+ level = 0;
+ for (i = 0; i < clocks.num_levels; i++) {
+ clk1 = clocks.data[i].clocks_in_khz / 1000;
+
+ if (i < (clocks.num_levels - 1))
+ clk2 = clocks.data[i + 1].clocks_in_khz / 1000;
+
+ if (curr_clk >= clk1 && curr_clk < clk2) {
+ level = (curr_clk - clk1) <= (clk2 - curr_clk) ?
+ i :
+ i + 1;
+ }
+
+ size += sysfs_emit_at(buf, size, "%d: %uMhz %s\n", i,
+ clk1, (level == i) ? "*" : "");
+ }
+ }
+
+ return size;
+}
+
static int smu_v13_0_6_print_clk_levels(struct smu_context *smu,
enum smu_clk_type type, char *buf)
{
- int i, now, size = 0;
+ int now, size = 0;
int ret = 0;
struct smu_umd_pstate_table *pstate_table = &smu->pstate_table;
- struct pp_clock_levels_with_latency clocks;
struct smu_13_0_dpm_table *single_dpm_table;
struct smu_dpm_context *smu_dpm = &smu->smu_dpm;
struct smu_13_0_dpm_context *dpm_context = NULL;
@@ -852,26 +902,9 @@ static int smu_v13_0_6_print_clk_levels(struct smu_context *smu,
}
single_dpm_table = &(dpm_context->dpm_tables.uclk_table);
- ret = smu_v13_0_6_get_clk_table(smu, &clocks, single_dpm_table);
- if (ret) {
- dev_err(smu->adev->dev,
- "Attempt to get memory clk levels Failed!");
- return ret;
- }
- for (i = 0; i < clocks.num_levels; i++)
- size += sysfs_emit_at(
- buf, size, "%d: %uMhz %s\n", i,
- clocks.data[i].clocks_in_khz / 1000,
- (clocks.num_levels == 1) ?
- "*" :
- (smu_v13_0_6_freqs_in_same_level(
- clocks.data[i].clocks_in_khz /
- 1000,
- now) ?
- "*" :
- ""));
- break;
+ return smu_v13_0_6_print_clks(smu, buf, single_dpm_table, now,
+ "mclk");
case SMU_SOCCLK:
ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_SOCCLK,
@@ -883,26 +916,9 @@ static int smu_v13_0_6_print_clk_levels(struct smu_context *smu,
}
single_dpm_table = &(dpm_context->dpm_tables.soc_table);
- ret = smu_v13_0_6_get_clk_table(smu, &clocks, single_dpm_table);
- if (ret) {
- dev_err(smu->adev->dev,
- "Attempt to get socclk levels Failed!");
- return ret;
- }
- for (i = 0; i < clocks.num_levels; i++)
- size += sysfs_emit_at(
- buf, size, "%d: %uMhz %s\n", i,
- clocks.data[i].clocks_in_khz / 1000,
- (clocks.num_levels == 1) ?
- "*" :
- (smu_v13_0_6_freqs_in_same_level(
- clocks.data[i].clocks_in_khz /
- 1000,
- now) ?
- "*" :
- ""));
- break;
+ return smu_v13_0_6_print_clks(smu, buf, single_dpm_table, now,
+ "socclk");
case SMU_FCLK:
ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_FCLK,
@@ -914,26 +930,9 @@ static int smu_v13_0_6_print_clk_levels(struct smu_context *smu,
}
single_dpm_table = &(dpm_context->dpm_tables.fclk_table);
- ret = smu_v13_0_6_get_clk_table(smu, &clocks, single_dpm_table);
- if (ret) {
- dev_err(smu->adev->dev,
- "Attempt to get fclk levels Failed!");
- return ret;
- }
- for (i = 0; i < single_dpm_table->count; i++)
- size += sysfs_emit_at(
- buf, size, "%d: %uMhz %s\n", i,
- single_dpm_table->dpm_levels[i].value,
- (clocks.num_levels == 1) ?
- "*" :
- (smu_v13_0_6_freqs_in_same_level(
- clocks.data[i].clocks_in_khz /
- 1000,
- now) ?
- "*" :
- ""));
- break;
+ return smu_v13_0_6_print_clks(smu, buf, single_dpm_table, now,
+ "fclk");
case SMU_VCLK:
ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_VCLK,
@@ -945,26 +944,9 @@ static int smu_v13_0_6_print_clk_levels(struct smu_context *smu,
}
single_dpm_table = &(dpm_context->dpm_tables.vclk_table);
- ret = smu_v13_0_6_get_clk_table(smu, &clocks, single_dpm_table);
- if (ret) {
- dev_err(smu->adev->dev,
- "Attempt to get vclk levels Failed!");
- return ret;
- }
- for (i = 0; i < single_dpm_table->count; i++)
- size += sysfs_emit_at(
- buf, size, "%d: %uMhz %s\n", i,
- single_dpm_table->dpm_levels[i].value,
- (clocks.num_levels == 1) ?
- "*" :
- (smu_v13_0_6_freqs_in_same_level(
- clocks.data[i].clocks_in_khz /
- 1000,
- now) ?
- "*" :
- ""));
- break;
+ return smu_v13_0_6_print_clks(smu, buf, single_dpm_table, now,
+ "vclk");
case SMU_DCLK:
ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_DCLK,
@@ -976,26 +958,9 @@ static int smu_v13_0_6_print_clk_levels(struct smu_context *smu,
}
single_dpm_table = &(dpm_context->dpm_tables.dclk_table);
- ret = smu_v13_0_6_get_clk_table(smu, &clocks, single_dpm_table);
- if (ret) {
- dev_err(smu->adev->dev,
- "Attempt to get dclk levels Failed!");
- return ret;
- }
- for (i = 0; i < single_dpm_table->count; i++)
- size += sysfs_emit_at(
- buf, size, "%d: %uMhz %s\n", i,
- single_dpm_table->dpm_levels[i].value,
- (clocks.num_levels == 1) ?
- "*" :
- (smu_v13_0_6_freqs_in_same_level(
- clocks.data[i].clocks_in_khz /
- 1000,
- now) ?
- "*" :
- ""));
- break;
+ return smu_v13_0_6_print_clks(smu, buf, single_dpm_table, now,
+ "dclk");
default:
break;
--
GitLab
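
To illustrate the reporting scheme described in the commit message, here is a small standalone sketch: an effective clock below a deep-sleep threshold gets its own "S:" line, otherwise the DPM level closest to the effective clock is marked with "*". The threshold value, the table contents, and the handling of clocks above the top level are illustrative assumptions, not taken verbatim from the patch.

/*
 * Illustrative sketch of closest-level clock reporting with a deep-sleep
 * threshold; values are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define DSCLK_THRESHOLD 100 /* MHz, mirrors the idea of SMU_13_0_6_DSCLK_THRESHOLD */

static void print_levels(const uint32_t *levels_mhz, int num, uint32_t curr_mhz)
{
	int i, marked = -1;

	if (curr_mhz < DSCLK_THRESHOLD) {
		/* Deep sleep: report the effective clock on its own line. */
		printf("S: %uMhz *\n", curr_mhz);
		for (i = 0; i < num; i++)
			printf("%d: %uMhz\n", i, levels_mhz[i]);
		return;
	}

	if (num == 1 || curr_mhz < levels_mhz[0])
		marked = 0;
	for (i = 0; i < num - 1; i++) {
		/* Pick whichever neighbouring level is nearer to the effective clock. */
		if (curr_mhz >= levels_mhz[i] && curr_mhz < levels_mhz[i + 1])
			marked = (curr_mhz - levels_mhz[i]) <=
				 (levels_mhz[i + 1] - curr_mhz) ? i : i + 1;
	}
	if (marked < 0 && curr_mhz >= levels_mhz[num - 1])
		marked = num - 1; /* assumption: above the top level, mark the highest */

	for (i = 0; i < num; i++)
		printf("%d: %uMhz %s\n", i, levels_mhz[i], (marked == i) ? "*" : "");
}

int main(void)
{
	const uint32_t levels[] = { 615, 800, 888, 1000 };

	print_levels(levels, 4, 860); /* 860 is nearest to 888 -> level 2 marked */
	print_levels(levels, 4, 19);  /* below threshold -> "S:" line */
	return 0;
}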