diff --git a/PKGBUILD b/PKGBUILD index eb839d8..bc06952 100644 --- a/PKGBUILD +++ b/PKGBUILD @@ -500,6 +500,7 @@ case $_basever in 0006-add-acs-overrides_iommu.patch 0007-v5.13-fsync.patch 0007-v5.13-futex2_interface.patch + 0007-v5.13-futex_waitv.patch 0007-v5.13-winesync.patch 0008-5.13-bcachefs.patch 0009-glitched-ondemand-bmq.patch @@ -530,6 +531,7 @@ case $_basever in '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' '89d837bfea3515504b1c99fc881ebdc4f15e2999558127a263e795fc69408a39' '9ec679871cba674cf876ba836cde969296ae5034bcc10e1ec39b372e6e07aab0' + '0e3473c19e5513bee886f03cf2476f746d8b5b2fbc0841c9d60d609b16a97c14' '034d12a73b507133da2c69a34d61efd2f6b6618549650aa26d748142d22002e1' 'b0004bc559653fd8719b8adcfa1ead1075db3425d30d7d7adb8cbc6296386a8f' '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' @@ -560,6 +562,7 @@ case $_basever in 0006-add-acs-overrides_iommu.patch 0007-v5.14-fsync.patch 0007-v5.14-futex2_interface.patch + 0007-v5.14-futex_waitv.patch 0007-v5.14-winesync.patch #0008-5.14-bcachefs.patch 0009-glitched-ondemand-bmq.patch @@ -586,6 +589,7 @@ case $_basever in '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' 'aa67e81a27d9062e463594acb91eca6dd13388f23cbe53ca56298f9dba61cc10' 'efe5e21706fdf64559ead866c85a5d88c5c3f743d814410df3810ca61cc5b966' + '5742277f41f22bf29fa9742562946b8a01377f8a22adb42ceed3607541c1d5b6' '034d12a73b507133da2c69a34d61efd2f6b6618549650aa26d748142d22002e1' '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' 'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911' @@ -614,6 +618,7 @@ case $_basever in 0006-add-acs-overrides_iommu.patch 0007-v5.15-fsync.patch #0007-v5.15-futex2_interface.patch + 0007-v5.15-futex_waitv.patch 0007-v5.15-winesync.patch #0008-5.14-bcachefs.patch #0009-glitched-ondemand-bmq.patch @@ -640,6 +645,7 @@ case $_basever in '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' '6c4f0099896f69e56ebd8c9eac266ac8ad993acecd50945e0e84ef6f95f9ddca' #'efe5e21706fdf64559ead866c85a5d88c5c3f743d814410df3810ca61cc5b966' + 'c8f7c50d9b1418ba22b5ca735c47111a162be416109714d26a674162e5b2cb97' '034d12a73b507133da2c69a34d61efd2f6b6618549650aa26d748142d22002e1' #'9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' #'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911' diff --git a/customization.cfg b/customization.cfg index 7c5fdad..c2ed5c3 100644 --- a/customization.cfg +++ b/customization.cfg @@ -154,6 +154,11 @@ _fsync="true" # https://gitlab.collabora.com/tonyk/linux/-/tree/futex2-dev _futex2="true" +# Set to "true" to enable backported patches to add support for the futex_waitv() syscall, a new interface for fsync. It will appear in mainline at Linux 5.16 release and requires a wine/proton with builtin support for it. It's expected to be available in Valve Proton 6.3 stable soon - https://github.com/ValveSoftware/wine/pull/128 +# !! Disables fsync/futex2 interfaces support !! +# https://github.com/andrealmeid/futex_waitv_patches +_futex_waitv="false" + # Set to "true" to enable support for winesync, an experimental replacement for esync - requires patched wine - https://repo.or.cz/linux/zf.git/shortlog/refs/heads/winesync # ! Can't be used on multiple kernels installed side-by-side, which will require https://aur.archlinux.org/packages/winesync-dkms/ instead of this option ! _winesync="false" diff --git a/linux-tkg-config/prepare b/linux-tkg-config/prepare index d220b0a..3bb897e 100644 --- a/linux-tkg-config/prepare +++ b/linux-tkg-config/prepare @@ -571,7 +571,7 @@ _tkg_srcprep() { _enable "CRYPTO_LZ4" "CRYPTO_LZ4HC" "LZ4_COMPRESS" "LZ4HC_COMPRESS" "ZSWAP_COMPRESSOR_DEFAULT_LZ4" "CMDLINE_BOOL" "CONFIG_BLK_DEV_LOOP" _disable "DEBUG_FORCE_FUNCTION_ALIGN_64B" scripts/config --set-str "ZSWAP_COMPRESSOR_DEFAULT" "lz4" - if [ "$_futex2" = "true" ] && [ "$_basever" != "54" ] && [ "$_basever" != "57" ] && [ "$_basever" != "58" ] && [ "$_basever" != "59" ]; then + if [ "$_futex2" = "true" ] && [ "$_futex_waitv" != "true" ] && [ "$_basever" != "54" ] && [ "$_basever" != "57" ] && [ "$_basever" != "58" ] && [ "$_basever" != "59" ]; then sed -i -e 's/# CONFIG_EXPERT is not set/CONFIG_EXPERT=y/' ./.config echo -e "\r# start of config expert\r # CONFIG_DEBUG_RSEQ is not set\r @@ -1140,6 +1140,25 @@ CONFIG_DEBUG_INFO_BTF_MODULES=y\r fi fi + # futex_waitv support + tkgpatch="$srcdir/0007-v${_basekernel}-futex_waitv.patch" + if [ -e "$tkgpatch" ]; then + if [ -z "$_futex_waitv" ]; then + plain "" + plain "Enable support for futex_waitv, backported patches for fsync from 5.16 Kernel" + plain "! Will disable fsync/futex2 patchsets !" + plain "https://github.com/andrealmeid/futex_waitv_patches" + plain "https://github.com/ValveSoftware/wine/pull/128" + read -rp "`echo $' > N/y : '`" CONDITION9; + fi + if [[ "$CONDITION9" =~ [yY] ]] || [ "$_futex_waitv" = "true" ]; then + _msg="Patching futex_waitv support" + _tkg_patcher + _fsync="false" + _futex2="false" + fi + fi + # fsync support tkgpatch="$srcdir/0007-v${_basekernel}-fsync.patch" if [ -e "$tkgpatch" ]; then @@ -1147,9 +1166,9 @@ CONFIG_DEBUG_INFO_BTF_MODULES=y\r plain "" plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" - read -rp "`echo $' > N/y : '`" CONDITION9; + read -rp "`echo $' > N/y : '`" CONDITION10; fi - if [[ "$CONDITION9" =~ [yY] ]] || [ "$_fsync" = "true" ]; then + if [[ "$CONDITION10" =~ [yY] ]] || [ "$_fsync" = "true" ]; then _msg="Patching Fsync support" _tkg_patcher fi @@ -1164,9 +1183,9 @@ CONFIG_DEBUG_INFO_BTF_MODULES=y\r plain "Can be enabled alongside regular fsync patchset to have a fallback option" plain "https://gitlab.collabora.com/tonyk/linux/-/tree/futex2-dev" plain "https://github.com/ValveSoftware/Proton/issues/4568" - read -rp "`echo $' > N/y : '`" CONDITION10; + read -rp "`echo $' > N/y : '`" CONDITION11; fi - if [[ "$CONDITION10" =~ [yY] ]] || [ "$_futex2" = "true" ]; then + if [[ "$CONDITION11" =~ [yY] ]] || [ "$_futex2" = "true" ]; then _msg="Patching futex2 support" _tkg_patcher _enable "FUTEX2" diff --git a/linux-tkg-patches/5.13/0007-v5.13-futex_waitv.patch b/linux-tkg-patches/5.13/0007-v5.13-futex_waitv.patch new file mode 100644 index 0000000..b4f4117 --- /dev/null +++ b/linux-tkg-patches/5.13/0007-v5.13-futex_waitv.patch @@ -0,0 +1,536 @@ +From 4901e29e3c0237c52eadd2c82deb9bd6e7add5ac Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 23 Sep 2021 14:11:05 -0300 +Subject: [PATCH 1/2] futex: Implement sys_futex_waitv() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add support to wait on multiple futexes. This is the interface +implemented by this syscall: + +futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes, + unsigned int flags, struct timespec *timeout, clockid_t clockid) + +struct futex_waitv { + __u64 val; + __u64 uaddr; + __u32 flags; + __u32 __reserved; +}; + +Given an array of struct futex_waitv, wait on each uaddr. The thread +wakes if a futex_wake() is performed at any uaddr. The syscall returns +immediately if any waiter has *uaddr != val. *timeout is an optional +absolute timeout value for the operation. This syscall supports only +64bit sized timeout structs. The flags argument of the syscall should be +empty, but it can be used for future extensions. Flags for shared +futexes, sizes, etc. should be used on the individual flags of each +waiter. + +__reserved is used for explicit padding and should be 0, but it might be +used for future extensions. If the userspace uses 32-bit pointers, it +should make sure to explicitly cast it when assigning to waitv::uaddr. + +Returns the array index of one of the woken futexes. There’s no given +information of how many were woken, or any particular attribute of it +(if it’s the first woken, if it is of the smaller index...). + +Signed-off-by: André Almeida +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com +--- + include/linux/syscalls.h | 6 + + include/uapi/asm-generic/unistd.h | 5 +- + include/uapi/linux/futex.h | 26 +++ + kernel/futex.c | 334 ++++++++++++++++++++++++++++++ + kernel/sys_ni.c | 1 + + 5 files changed, 371 insertions(+), 1 deletion(-) + +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 050511e8f1f8..8390482cf082 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -58,6 +58,7 @@ struct mq_attr; + struct compat_stat; + struct old_timeval32; + struct robust_list_head; ++struct futex_waitv; + struct getcpu_cache; + struct old_linux_dirent; + struct perf_event_attr; +@@ -623,6 +624,11 @@ asmlinkage long sys_get_robust_list(int pid, + asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, + size_t len); + ++asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, ++ unsigned int nr_futexes, unsigned int flags, ++ struct __kernel_timespec __user *timeout, clockid_t clockid); ++ ++ + /* kernel/hrtimer.c */ + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index d2a942086fcb..3f55ac23cea9 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -872,8 +872,11 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) + #define __NR_landlock_restrict_self 446 + __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) + ++#define __NR_futex_waitv 449 ++__SYSCALL(__NR_futex_waitv, sys_futex_waitv) ++ + #undef __NR_syscalls +-#define __NR_syscalls 447 ++#define __NR_syscalls 450 + + /* + * 32 bit systems traditionally used different +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e..1666f5e4b837 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -41,6 +41,32 @@ + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) + ++ /* ++ * Flags to specify the bit length of the futex word for futex2 syscalls. ++ * Currently, only 32 is supported. ++ */ ++#define FUTEX_32 2 ++ ++/* ++ * Max numbers of elements in a futex_waitv array ++ */ ++#define FUTEX_WAITV_MAX 128 ++ ++/** ++ * struct futex_waitv - A waiter for vectorized wait ++ * @val: Expected value at uaddr ++ * @uaddr: User address to wait on ++ * @flags: Flags for this waiter ++ * @__reserved: Reserved member to preserve data alignment. Should be 0. ++ */ ++struct futex_waitv { ++ __u64 val; ++ __u64 uaddr; ++ __u32 flags; ++ __u32 __reserved; ++}; ++ ++ + /* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. +diff --git a/kernel/futex.c b/kernel/futex.c +index 408cad5e8968..d7dc0bd9379c 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -227,6 +227,18 @@ static const struct futex_q futex_q_init = { + .bitset = FUTEX_BITSET_MATCH_ANY + }; + ++/** ++ * struct futex_vector - Auxiliary struct for futex_waitv() ++ * @w: Userspace provided data ++ * @q: Kernel side data ++ * ++ * Struct used to build an array with all data need for futex_waitv() ++ */ ++struct futex_vector { ++ struct futex_waitv w; ++ struct futex_q q; ++}; ++ + /* + * Hash buckets are shared by all the futex_keys that hash to the same + * location. Each key may have multiple futex_q structures, one for each task +@@ -3962,6 +3974,328 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + } + #endif /* CONFIG_COMPAT */ + ++/* Mask of available flags for each futex in futex_waitv list */ ++#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) ++ ++/** ++ * futex_parse_waitv - Parse a waitv array from userspace ++ * @futexv: Kernel side list of waiters to be filled ++ * @uwaitv: Userspace list to be parsed ++ * @nr_futexes: Length of futexv ++ * ++ * Return: Error code on failure, 0 on success ++ */ ++static int futex_parse_waitv(struct futex_vector *futexv, ++ struct futex_waitv __user *uwaitv, ++ unsigned int nr_futexes) ++{ ++ struct futex_waitv aux; ++ unsigned int i; ++ ++ for (i = 0; i < nr_futexes; i++) { ++ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) ++ return -EFAULT; ++ ++ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) ++ return -EINVAL; ++ ++ if (!(aux.flags & FUTEX_32)) ++ return -EINVAL; ++ ++ futexv[i].w.flags = aux.flags; ++ futexv[i].w.val = aux.val; ++ futexv[i].w.uaddr = aux.uaddr; ++ futexv[i].q = futex_q_init; ++ } ++ ++ return 0; ++} ++ ++/** ++ * unqueue_multiple - Remove various futexes from their hash bucket ++ * @v: The list of futexes to unqueue ++ * @count: Number of futexes in the list ++ * ++ * Helper to unqueue a list of futexes. This can't fail. ++ * ++ * Return: ++ * - >=0 - Index of the last futex that was awoken; ++ * - -1 - No futex was awoken ++ */ ++static int unqueue_multiple(struct futex_vector *v, int count) ++{ ++ int ret = -1, i; ++ ++ for (i = 0; i < count; i++) { ++ if (!unqueue_me(&v[i].q)) ++ ret = i; ++ } ++ ++ return ret; ++} ++ ++/** ++ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes ++ * @vs: The futex list to wait on ++ * @count: The size of the list ++ * @woken: Index of the last woken futex, if any. Used to notify the ++ * caller that it can return this index to userspace (return parameter) ++ * ++ * Prepare multiple futexes in a single step and enqueue them. This may fail if ++ * the futex list is invalid or if any futex was already awoken. On success the ++ * task is ready to interruptible sleep. ++ * ++ * Return: ++ * - 1 - One of the futexes was woken by another thread ++ * - 0 - Success ++ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL ++ */ ++static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) ++{ ++ struct futex_hash_bucket *hb; ++ bool retry = false; ++ int ret, i; ++ u32 uval; ++ ++ /* ++ * Enqueuing multiple futexes is tricky, because we need to enqueue ++ * each futex on the list before dealing with the next one to avoid ++ * deadlocking on the hash bucket. But, before enqueuing, we need to ++ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't ++ * lose any wake events, which cannot be done before the get_futex_key ++ * of the next key, because it calls get_user_pages, which can sleep. ++ * Thus, we fetch the list of futexes keys in two steps, by first ++ * pinning all the memory keys in the futex key, and only then we read ++ * each key and queue the corresponding futex. ++ * ++ * Private futexes doesn't need to recalculate hash in retry, so skip ++ * get_futex_key() when retrying. ++ */ ++retry: ++ for (i = 0; i < count; i++) { ++ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) ++ continue; ++ ++ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), ++ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), ++ &vs[i].q.key, FUTEX_READ); ++ ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; ++ struct futex_q *q = &vs[i].q; ++ u32 val = (u32)vs[i].w.val; ++ ++ hb = queue_lock(q); ++ ret = get_futex_value_locked(&uval, uaddr); ++ ++ if (!ret && uval == val) { ++ /* ++ * The bucket lock can't be held while dealing with the ++ * next futex. Queue each futex at this moment so hb can ++ * be unlocked. ++ */ ++ queue_me(q, hb); ++ continue; ++ } ++ ++ queue_unlock(hb); ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * Even if something went wrong, if we find out that a futex ++ * was woken, we don't return error and return this index to ++ * userspace ++ */ ++ *woken = unqueue_multiple(vs, i); ++ if (*woken >= 0) ++ return 1; ++ ++ if (ret) { ++ /* ++ * If we need to handle a page fault, we need to do so ++ * without any lock and any enqueued futex (otherwise ++ * we could lose some wakeup). So we do it here, after ++ * undoing all the work done so far. In success, we ++ * retry all the work. ++ */ ++ if (get_user(uval, uaddr)) ++ return -EFAULT; ++ ++ retry = true; ++ goto retry; ++ } ++ ++ if (uval != val) ++ return -EWOULDBLOCK; ++ } ++ ++ return 0; ++} ++ ++/** ++ * futex_sleep_multiple - Check sleeping conditions and sleep ++ * @vs: List of futexes to wait for ++ * @count: Length of vs ++ * @to: Timeout ++ * ++ * Sleep if and only if the timeout hasn't expired and no futex on the list has ++ * been woken up. ++ */ ++static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, ++ struct hrtimer_sleeper *to) ++{ ++ if (to && !to->task) ++ return; ++ ++ for (; count; count--, vs++) { ++ if (!READ_ONCE(vs->q.lock_ptr)) ++ return; ++ } ++ ++ freezable_schedule(); ++} ++ ++/** ++ * futex_wait_multiple - Prepare to wait on and enqueue several futexes ++ * @vs: The list of futexes to wait on ++ * @count: The number of objects ++ * @to: Timeout before giving up and returning to userspace ++ * ++ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function ++ * sleeps on a group of futexes and returns on the first futex that is ++ * wake, or after the timeout has elapsed. ++ * ++ * Return: ++ * - >=0 - Hint to the futex that was awoken ++ * - <0 - On error ++ */ ++int futex_wait_multiple(struct futex_vector *vs, unsigned int count, ++ struct hrtimer_sleeper *to) ++{ ++ int ret, hint = 0; ++ ++ if (to) ++ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); ++ ++ while (1) { ++ ret = futex_wait_multiple_setup(vs, count, &hint); ++ if (ret) { ++ if (ret > 0) { ++ /* A futex was woken during setup */ ++ ret = hint; ++ } ++ return ret; ++ } ++ ++ futex_sleep_multiple(vs, count, to); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ ret = unqueue_multiple(vs, count); ++ if (ret >= 0) ++ return ret; ++ ++ if (to && !to->task) ++ return -ETIMEDOUT; ++ else if (signal_pending(current)) ++ return -ERESTARTSYS; ++ /* ++ * The final case is a spurious wakeup, for ++ * which just retry. ++ */ ++ } ++} ++/* Mask of available flags for each futex in futex_waitv list */ ++#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) ++ ++/** ++ * sys_futex_waitv - Wait on a list of futexes ++ * @waiters: List of futexes to wait on ++ * @nr_futexes: Length of futexv ++ * @flags: Flag for timeout (monotonic/realtime) ++ * @timeout: Optional absolute timeout. ++ * @clockid: Clock to be used for the timeout, realtime or monotonic. ++ * ++ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes ++ * if a futex_wake() is performed at any uaddr. The syscall returns immediately ++ * if any waiter has *uaddr != val. *timeout is an optional timeout value for ++ * the operation. Each waiter has individual flags. The `flags` argument for ++ * the syscall should be used solely for specifying the timeout as realtime, if ++ * needed. Flags for private futexes, sizes, etc. should be used on the ++ * individual flags of each waiter. ++ * ++ * Returns the array index of one of the woken futexes. No further information ++ * is provided: any number of other futexes may also have been woken by the ++ * same event, and if more than one futex was woken, the retrned index may ++ * refer to any one of them. (It is not necessaryily the futex with the ++ * smallest index, nor the one most recently woken, nor...) ++ */ ++ ++SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, ++ unsigned int, nr_futexes, unsigned int, flags, ++ struct __kernel_timespec __user *, timeout, clockid_t, clockid) ++{ ++ struct hrtimer_sleeper to; ++ struct futex_vector *futexv; ++ struct timespec64 ts; ++ ktime_t time; ++ int ret; ++ ++ /* This syscall supports no flags for now */ ++ if (flags) ++ return -EINVAL; ++ ++ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) ++ return -EINVAL; ++ ++ if (timeout) { ++ int flag_clkid = 0, flag_init = 0; ++ ++ if (clockid == CLOCK_REALTIME) { ++ flag_clkid = FLAGS_CLOCKRT; ++ flag_init = FUTEX_CLOCK_REALTIME; ++ } ++ ++ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) ++ return -EINVAL; ++ ++ if (get_timespec64(&ts, timeout)) ++ return -EFAULT; ++ ++ /* ++ * Since there's no opcode for futex_waitv, use ++ * FUTEX_WAIT_BITSET that uses absolute timeout as well ++ */ ++ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); ++ if (ret) ++ return ret; ++ ++ futex_setup_timer(&time, &to, flag_clkid, 0); ++ } ++ ++ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); ++ if (!futexv) ++ return -ENOMEM; ++ ++ ret = futex_parse_waitv(futexv, waiters, nr_futexes); ++ if (!ret) ++ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); ++ ++ if (timeout) { ++ hrtimer_cancel(&to.timer); ++ destroy_hrtimer_on_stack(&to.timer); ++ } ++ ++ kfree(futexv); ++ return ret; ++} ++ + #ifdef CONFIG_COMPAT_32BIT_TIME + SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index 0ea8128468c3..0979fac9414d 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -150,6 +150,7 @@ COND_SYSCALL(set_robust_list); + COND_SYSCALL_COMPAT(set_robust_list); + COND_SYSCALL(get_robust_list); + COND_SYSCALL_COMPAT(get_robust_list); ++COND_SYSCALL(futex_waitv); + + /* kernel/hrtimer.c */ + +-- +2.33.1 + +From 4e40f3886e134f33c50ca79bc8b323cea784bd78 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 23 Sep 2021 14:11:06 -0300 +Subject: [PATCH 2/2] futex,x86: Wire up sys_futex_waitv() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Wire up syscall entry point for x86 arch, for both i386 and x86_64. + +Signed-off-by: André Almeida +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20210923171111.300673-18-andrealmeid@collabora.com +--- + arch/x86/entry/syscalls/syscall_32.tbl | 1 + + arch/x86/entry/syscalls/syscall_64.tbl | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 4bbc267fb36b..b2b9b9df1355 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -451,3 +451,4 @@ + 444 i386 landlock_create_ruleset sys_landlock_create_ruleset + 445 i386 landlock_add_rule sys_landlock_add_rule + 446 i386 landlock_restrict_self sys_landlock_restrict_self ++449 i386 futex_waitv sys_futex_waitv +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index ce18119ea0d0..bfd4e8f5be34 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -368,6 +368,7 @@ + 444 common landlock_create_ruleset sys_landlock_create_ruleset + 445 common landlock_add_rule sys_landlock_add_rule + 446 common landlock_restrict_self sys_landlock_restrict_self ++449 common futex_waitv sys_futex_waitv + + # + # Due to a historical design error, certain syscalls are numbered differently +-- +2.33.1 + diff --git a/linux-tkg-patches/5.14/0007-v5.14-futex_waitv.patch b/linux-tkg-patches/5.14/0007-v5.14-futex_waitv.patch new file mode 100644 index 0000000..fe51695 --- /dev/null +++ b/linux-tkg-patches/5.14/0007-v5.14-futex_waitv.patch @@ -0,0 +1,536 @@ +From 4901e29e3c0237c52eadd2c82deb9bd6e7add5ac Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 23 Sep 2021 14:11:05 -0300 +Subject: [PATCH 1/2] futex: Implement sys_futex_waitv() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add support to wait on multiple futexes. This is the interface +implemented by this syscall: + +futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes, + unsigned int flags, struct timespec *timeout, clockid_t clockid) + +struct futex_waitv { + __u64 val; + __u64 uaddr; + __u32 flags; + __u32 __reserved; +}; + +Given an array of struct futex_waitv, wait on each uaddr. The thread +wakes if a futex_wake() is performed at any uaddr. The syscall returns +immediately if any waiter has *uaddr != val. *timeout is an optional +absolute timeout value for the operation. This syscall supports only +64bit sized timeout structs. The flags argument of the syscall should be +empty, but it can be used for future extensions. Flags for shared +futexes, sizes, etc. should be used on the individual flags of each +waiter. + +__reserved is used for explicit padding and should be 0, but it might be +used for future extensions. If the userspace uses 32-bit pointers, it +should make sure to explicitly cast it when assigning to waitv::uaddr. + +Returns the array index of one of the woken futexes. There’s no given +information of how many were woken, or any particular attribute of it +(if it’s the first woken, if it is of the smaller index...). + +Signed-off-by: André Almeida +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com +--- + include/linux/syscalls.h | 6 + + include/uapi/asm-generic/unistd.h | 5 +- + include/uapi/linux/futex.h | 26 +++ + kernel/futex.c | 334 ++++++++++++++++++++++++++++++ + kernel/sys_ni.c | 1 + + 5 files changed, 371 insertions(+), 1 deletion(-) + +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 050511e8f1f8..8390482cf082 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -58,6 +58,7 @@ struct mq_attr; + struct compat_stat; + struct old_timeval32; + struct robust_list_head; ++struct futex_waitv; + struct getcpu_cache; + struct old_linux_dirent; + struct perf_event_attr; +@@ -623,6 +624,11 @@ asmlinkage long sys_get_robust_list(int pid, + asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, + size_t len); + ++asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, ++ unsigned int nr_futexes, unsigned int flags, ++ struct __kernel_timespec __user *timeout, clockid_t clockid); ++ ++ + /* kernel/hrtimer.c */ + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index d2a942086fcb..3f55ac23cea9 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -878,8 +878,11 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) + __SYSCALL(__NR_memfd_secret, sys_memfd_secret) + #endif + ++#define __NR_futex_waitv 449 ++__SYSCALL(__NR_futex_waitv, sys_futex_waitv) ++ + #undef __NR_syscalls +-#define __NR_syscalls 448 ++#define __NR_syscalls 450 + + /* + * 32 bit systems traditionally used different +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e..1666f5e4b837 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -41,6 +41,32 @@ + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) + ++ /* ++ * Flags to specify the bit length of the futex word for futex2 syscalls. ++ * Currently, only 32 is supported. ++ */ ++#define FUTEX_32 2 ++ ++/* ++ * Max numbers of elements in a futex_waitv array ++ */ ++#define FUTEX_WAITV_MAX 128 ++ ++/** ++ * struct futex_waitv - A waiter for vectorized wait ++ * @val: Expected value at uaddr ++ * @uaddr: User address to wait on ++ * @flags: Flags for this waiter ++ * @__reserved: Reserved member to preserve data alignment. Should be 0. ++ */ ++struct futex_waitv { ++ __u64 val; ++ __u64 uaddr; ++ __u32 flags; ++ __u32 __reserved; ++}; ++ ++ + /* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. +diff --git a/kernel/futex.c b/kernel/futex.c +index 408cad5e8968..d7dc0bd9379c 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -227,6 +227,18 @@ static const struct futex_q futex_q_init = { + .bitset = FUTEX_BITSET_MATCH_ANY + }; + ++/** ++ * struct futex_vector - Auxiliary struct for futex_waitv() ++ * @w: Userspace provided data ++ * @q: Kernel side data ++ * ++ * Struct used to build an array with all data need for futex_waitv() ++ */ ++struct futex_vector { ++ struct futex_waitv w; ++ struct futex_q q; ++}; ++ + /* + * Hash buckets are shared by all the futex_keys that hash to the same + * location. Each key may have multiple futex_q structures, one for each task +@@ -3962,6 +3974,328 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + } + #endif /* CONFIG_COMPAT */ + ++/* Mask of available flags for each futex in futex_waitv list */ ++#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) ++ ++/** ++ * futex_parse_waitv - Parse a waitv array from userspace ++ * @futexv: Kernel side list of waiters to be filled ++ * @uwaitv: Userspace list to be parsed ++ * @nr_futexes: Length of futexv ++ * ++ * Return: Error code on failure, 0 on success ++ */ ++static int futex_parse_waitv(struct futex_vector *futexv, ++ struct futex_waitv __user *uwaitv, ++ unsigned int nr_futexes) ++{ ++ struct futex_waitv aux; ++ unsigned int i; ++ ++ for (i = 0; i < nr_futexes; i++) { ++ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) ++ return -EFAULT; ++ ++ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) ++ return -EINVAL; ++ ++ if (!(aux.flags & FUTEX_32)) ++ return -EINVAL; ++ ++ futexv[i].w.flags = aux.flags; ++ futexv[i].w.val = aux.val; ++ futexv[i].w.uaddr = aux.uaddr; ++ futexv[i].q = futex_q_init; ++ } ++ ++ return 0; ++} ++ ++/** ++ * unqueue_multiple - Remove various futexes from their hash bucket ++ * @v: The list of futexes to unqueue ++ * @count: Number of futexes in the list ++ * ++ * Helper to unqueue a list of futexes. This can't fail. ++ * ++ * Return: ++ * - >=0 - Index of the last futex that was awoken; ++ * - -1 - No futex was awoken ++ */ ++static int unqueue_multiple(struct futex_vector *v, int count) ++{ ++ int ret = -1, i; ++ ++ for (i = 0; i < count; i++) { ++ if (!unqueue_me(&v[i].q)) ++ ret = i; ++ } ++ ++ return ret; ++} ++ ++/** ++ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes ++ * @vs: The futex list to wait on ++ * @count: The size of the list ++ * @woken: Index of the last woken futex, if any. Used to notify the ++ * caller that it can return this index to userspace (return parameter) ++ * ++ * Prepare multiple futexes in a single step and enqueue them. This may fail if ++ * the futex list is invalid or if any futex was already awoken. On success the ++ * task is ready to interruptible sleep. ++ * ++ * Return: ++ * - 1 - One of the futexes was woken by another thread ++ * - 0 - Success ++ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL ++ */ ++static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) ++{ ++ struct futex_hash_bucket *hb; ++ bool retry = false; ++ int ret, i; ++ u32 uval; ++ ++ /* ++ * Enqueuing multiple futexes is tricky, because we need to enqueue ++ * each futex on the list before dealing with the next one to avoid ++ * deadlocking on the hash bucket. But, before enqueuing, we need to ++ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't ++ * lose any wake events, which cannot be done before the get_futex_key ++ * of the next key, because it calls get_user_pages, which can sleep. ++ * Thus, we fetch the list of futexes keys in two steps, by first ++ * pinning all the memory keys in the futex key, and only then we read ++ * each key and queue the corresponding futex. ++ * ++ * Private futexes doesn't need to recalculate hash in retry, so skip ++ * get_futex_key() when retrying. ++ */ ++retry: ++ for (i = 0; i < count; i++) { ++ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) ++ continue; ++ ++ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), ++ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), ++ &vs[i].q.key, FUTEX_READ); ++ ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; ++ struct futex_q *q = &vs[i].q; ++ u32 val = (u32)vs[i].w.val; ++ ++ hb = queue_lock(q); ++ ret = get_futex_value_locked(&uval, uaddr); ++ ++ if (!ret && uval == val) { ++ /* ++ * The bucket lock can't be held while dealing with the ++ * next futex. Queue each futex at this moment so hb can ++ * be unlocked. ++ */ ++ queue_me(q, hb); ++ continue; ++ } ++ ++ queue_unlock(hb); ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * Even if something went wrong, if we find out that a futex ++ * was woken, we don't return error and return this index to ++ * userspace ++ */ ++ *woken = unqueue_multiple(vs, i); ++ if (*woken >= 0) ++ return 1; ++ ++ if (ret) { ++ /* ++ * If we need to handle a page fault, we need to do so ++ * without any lock and any enqueued futex (otherwise ++ * we could lose some wakeup). So we do it here, after ++ * undoing all the work done so far. In success, we ++ * retry all the work. ++ */ ++ if (get_user(uval, uaddr)) ++ return -EFAULT; ++ ++ retry = true; ++ goto retry; ++ } ++ ++ if (uval != val) ++ return -EWOULDBLOCK; ++ } ++ ++ return 0; ++} ++ ++/** ++ * futex_sleep_multiple - Check sleeping conditions and sleep ++ * @vs: List of futexes to wait for ++ * @count: Length of vs ++ * @to: Timeout ++ * ++ * Sleep if and only if the timeout hasn't expired and no futex on the list has ++ * been woken up. ++ */ ++static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, ++ struct hrtimer_sleeper *to) ++{ ++ if (to && !to->task) ++ return; ++ ++ for (; count; count--, vs++) { ++ if (!READ_ONCE(vs->q.lock_ptr)) ++ return; ++ } ++ ++ freezable_schedule(); ++} ++ ++/** ++ * futex_wait_multiple - Prepare to wait on and enqueue several futexes ++ * @vs: The list of futexes to wait on ++ * @count: The number of objects ++ * @to: Timeout before giving up and returning to userspace ++ * ++ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function ++ * sleeps on a group of futexes and returns on the first futex that is ++ * wake, or after the timeout has elapsed. ++ * ++ * Return: ++ * - >=0 - Hint to the futex that was awoken ++ * - <0 - On error ++ */ ++int futex_wait_multiple(struct futex_vector *vs, unsigned int count, ++ struct hrtimer_sleeper *to) ++{ ++ int ret, hint = 0; ++ ++ if (to) ++ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); ++ ++ while (1) { ++ ret = futex_wait_multiple_setup(vs, count, &hint); ++ if (ret) { ++ if (ret > 0) { ++ /* A futex was woken during setup */ ++ ret = hint; ++ } ++ return ret; ++ } ++ ++ futex_sleep_multiple(vs, count, to); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ ret = unqueue_multiple(vs, count); ++ if (ret >= 0) ++ return ret; ++ ++ if (to && !to->task) ++ return -ETIMEDOUT; ++ else if (signal_pending(current)) ++ return -ERESTARTSYS; ++ /* ++ * The final case is a spurious wakeup, for ++ * which just retry. ++ */ ++ } ++} ++/* Mask of available flags for each futex in futex_waitv list */ ++#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) ++ ++/** ++ * sys_futex_waitv - Wait on a list of futexes ++ * @waiters: List of futexes to wait on ++ * @nr_futexes: Length of futexv ++ * @flags: Flag for timeout (monotonic/realtime) ++ * @timeout: Optional absolute timeout. ++ * @clockid: Clock to be used for the timeout, realtime or monotonic. ++ * ++ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes ++ * if a futex_wake() is performed at any uaddr. The syscall returns immediately ++ * if any waiter has *uaddr != val. *timeout is an optional timeout value for ++ * the operation. Each waiter has individual flags. The `flags` argument for ++ * the syscall should be used solely for specifying the timeout as realtime, if ++ * needed. Flags for private futexes, sizes, etc. should be used on the ++ * individual flags of each waiter. ++ * ++ * Returns the array index of one of the woken futexes. No further information ++ * is provided: any number of other futexes may also have been woken by the ++ * same event, and if more than one futex was woken, the retrned index may ++ * refer to any one of them. (It is not necessaryily the futex with the ++ * smallest index, nor the one most recently woken, nor...) ++ */ ++ ++SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, ++ unsigned int, nr_futexes, unsigned int, flags, ++ struct __kernel_timespec __user *, timeout, clockid_t, clockid) ++{ ++ struct hrtimer_sleeper to; ++ struct futex_vector *futexv; ++ struct timespec64 ts; ++ ktime_t time; ++ int ret; ++ ++ /* This syscall supports no flags for now */ ++ if (flags) ++ return -EINVAL; ++ ++ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) ++ return -EINVAL; ++ ++ if (timeout) { ++ int flag_clkid = 0, flag_init = 0; ++ ++ if (clockid == CLOCK_REALTIME) { ++ flag_clkid = FLAGS_CLOCKRT; ++ flag_init = FUTEX_CLOCK_REALTIME; ++ } ++ ++ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) ++ return -EINVAL; ++ ++ if (get_timespec64(&ts, timeout)) ++ return -EFAULT; ++ ++ /* ++ * Since there's no opcode for futex_waitv, use ++ * FUTEX_WAIT_BITSET that uses absolute timeout as well ++ */ ++ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); ++ if (ret) ++ return ret; ++ ++ futex_setup_timer(&time, &to, flag_clkid, 0); ++ } ++ ++ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); ++ if (!futexv) ++ return -ENOMEM; ++ ++ ret = futex_parse_waitv(futexv, waiters, nr_futexes); ++ if (!ret) ++ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); ++ ++ if (timeout) { ++ hrtimer_cancel(&to.timer); ++ destroy_hrtimer_on_stack(&to.timer); ++ } ++ ++ kfree(futexv); ++ return ret; ++} ++ + #ifdef CONFIG_COMPAT_32BIT_TIME + SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index 0ea8128468c3..0979fac9414d 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -150,6 +150,7 @@ COND_SYSCALL(set_robust_list); + COND_SYSCALL_COMPAT(set_robust_list); + COND_SYSCALL(get_robust_list); + COND_SYSCALL_COMPAT(get_robust_list); ++COND_SYSCALL(futex_waitv); + + /* kernel/hrtimer.c */ + +-- +2.33.1 + +From 4e40f3886e134f33c50ca79bc8b323cea784bd78 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 23 Sep 2021 14:11:06 -0300 +Subject: [PATCH 2/2] futex,x86: Wire up sys_futex_waitv() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Wire up syscall entry point for x86 arch, for both i386 and x86_64. + +Signed-off-by: André Almeida +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20210923171111.300673-18-andrealmeid@collabora.com +--- + arch/x86/entry/syscalls/syscall_32.tbl | 1 + + arch/x86/entry/syscalls/syscall_64.tbl | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 4bbc267fb36b..b2b9b9df1355 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -452,3 +452,4 @@ + 445 i386 landlock_add_rule sys_landlock_add_rule + 446 i386 landlock_restrict_self sys_landlock_restrict_self + 447 i386 memfd_secret sys_memfd_secret ++449 i386 futex_waitv sys_futex_waitv +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index ce18119ea0d0..bfd4e8f5be34 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -369,6 +369,7 @@ + 445 common landlock_add_rule sys_landlock_add_rule + 446 common landlock_restrict_self sys_landlock_restrict_self + 447 common memfd_secret sys_memfd_secret ++449 common futex_waitv sys_futex_waitv + + # + # Due to a historical design error, certain syscalls are numbered differently +-- +2.33.1 + diff --git a/linux-tkg-patches/5.15/0007-v5.15-futex_waitv.patch b/linux-tkg-patches/5.15/0007-v5.15-futex_waitv.patch new file mode 100644 index 0000000..0813182 --- /dev/null +++ b/linux-tkg-patches/5.15/0007-v5.15-futex_waitv.patch @@ -0,0 +1,536 @@ +From 4901e29e3c0237c52eadd2c82deb9bd6e7add5ac Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 23 Sep 2021 14:11:05 -0300 +Subject: [PATCH 1/2] futex: Implement sys_futex_waitv() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add support to wait on multiple futexes. This is the interface +implemented by this syscall: + +futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes, + unsigned int flags, struct timespec *timeout, clockid_t clockid) + +struct futex_waitv { + __u64 val; + __u64 uaddr; + __u32 flags; + __u32 __reserved; +}; + +Given an array of struct futex_waitv, wait on each uaddr. The thread +wakes if a futex_wake() is performed at any uaddr. The syscall returns +immediately if any waiter has *uaddr != val. *timeout is an optional +absolute timeout value for the operation. This syscall supports only +64bit sized timeout structs. The flags argument of the syscall should be +empty, but it can be used for future extensions. Flags for shared +futexes, sizes, etc. should be used on the individual flags of each +waiter. + +__reserved is used for explicit padding and should be 0, but it might be +used for future extensions. If the userspace uses 32-bit pointers, it +should make sure to explicitly cast it when assigning to waitv::uaddr. + +Returns the array index of one of the woken futexes. There’s no given +information of how many were woken, or any particular attribute of it +(if it’s the first woken, if it is of the smaller index...). + +Signed-off-by: André Almeida +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com +--- + include/linux/syscalls.h | 6 + + include/uapi/asm-generic/unistd.h | 5 +- + include/uapi/linux/futex.h | 26 +++ + kernel/futex.c | 334 ++++++++++++++++++++++++++++++ + kernel/sys_ni.c | 1 + + 5 files changed, 371 insertions(+), 1 deletion(-) + +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 050511e8f1f8..8390482cf082 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -58,6 +58,7 @@ struct mq_attr; + struct compat_stat; + struct old_timeval32; + struct robust_list_head; ++struct futex_waitv; + struct getcpu_cache; + struct old_linux_dirent; + struct perf_event_attr; +@@ -623,6 +624,11 @@ asmlinkage long sys_get_robust_list(int pid, + asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, + size_t len); + ++asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, ++ unsigned int nr_futexes, unsigned int flags, ++ struct __kernel_timespec __user *timeout, clockid_t clockid); ++ ++ + /* kernel/hrtimer.c */ + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index d2a942086fcb..3f55ac23cea9 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -880,8 +880,11 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) + #define __NR_process_mrelease 448 + __SYSCALL(__NR_process_mrelease, sys_process_mrelease) + ++#define __NR_futex_waitv 449 ++__SYSCALL(__NR_futex_waitv, sys_futex_waitv) ++ + #undef __NR_syscalls +-#define __NR_syscalls 449 ++#define __NR_syscalls 450 + + /* + * 32 bit systems traditionally used different +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e..1666f5e4b837 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -41,6 +41,32 @@ + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) + ++ /* ++ * Flags to specify the bit length of the futex word for futex2 syscalls. ++ * Currently, only 32 is supported. ++ */ ++#define FUTEX_32 2 ++ ++/* ++ * Max numbers of elements in a futex_waitv array ++ */ ++#define FUTEX_WAITV_MAX 128 ++ ++/** ++ * struct futex_waitv - A waiter for vectorized wait ++ * @val: Expected value at uaddr ++ * @uaddr: User address to wait on ++ * @flags: Flags for this waiter ++ * @__reserved: Reserved member to preserve data alignment. Should be 0. ++ */ ++struct futex_waitv { ++ __u64 val; ++ __u64 uaddr; ++ __u32 flags; ++ __u32 __reserved; ++}; ++ ++ + /* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. +diff --git a/kernel/futex.c b/kernel/futex.c +index 408cad5e8968..d7dc0bd9379c 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -285,6 +285,18 @@ static const struct futex_q futex_q_init = { + .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), + }; + ++/** ++ * struct futex_vector - Auxiliary struct for futex_waitv() ++ * @w: Userspace provided data ++ * @q: Kernel side data ++ * ++ * Struct used to build an array with all data need for futex_waitv() ++ */ ++struct futex_vector { ++ struct futex_waitv w; ++ struct futex_q q; ++}; ++ + /* + * Hash buckets are shared by all the futex_keys that hash to the same + * location. Each key may have multiple futex_q structures, one for each task +@@ -3962,6 +3974,328 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + } + #endif /* CONFIG_COMPAT */ + ++/* Mask of available flags for each futex in futex_waitv list */ ++#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) ++ ++/** ++ * futex_parse_waitv - Parse a waitv array from userspace ++ * @futexv: Kernel side list of waiters to be filled ++ * @uwaitv: Userspace list to be parsed ++ * @nr_futexes: Length of futexv ++ * ++ * Return: Error code on failure, 0 on success ++ */ ++static int futex_parse_waitv(struct futex_vector *futexv, ++ struct futex_waitv __user *uwaitv, ++ unsigned int nr_futexes) ++{ ++ struct futex_waitv aux; ++ unsigned int i; ++ ++ for (i = 0; i < nr_futexes; i++) { ++ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) ++ return -EFAULT; ++ ++ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) ++ return -EINVAL; ++ ++ if (!(aux.flags & FUTEX_32)) ++ return -EINVAL; ++ ++ futexv[i].w.flags = aux.flags; ++ futexv[i].w.val = aux.val; ++ futexv[i].w.uaddr = aux.uaddr; ++ futexv[i].q = futex_q_init; ++ } ++ ++ return 0; ++} ++ ++/** ++ * unqueue_multiple - Remove various futexes from their hash bucket ++ * @v: The list of futexes to unqueue ++ * @count: Number of futexes in the list ++ * ++ * Helper to unqueue a list of futexes. This can't fail. ++ * ++ * Return: ++ * - >=0 - Index of the last futex that was awoken; ++ * - -1 - No futex was awoken ++ */ ++static int unqueue_multiple(struct futex_vector *v, int count) ++{ ++ int ret = -1, i; ++ ++ for (i = 0; i < count; i++) { ++ if (!unqueue_me(&v[i].q)) ++ ret = i; ++ } ++ ++ return ret; ++} ++ ++/** ++ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes ++ * @vs: The futex list to wait on ++ * @count: The size of the list ++ * @woken: Index of the last woken futex, if any. Used to notify the ++ * caller that it can return this index to userspace (return parameter) ++ * ++ * Prepare multiple futexes in a single step and enqueue them. This may fail if ++ * the futex list is invalid or if any futex was already awoken. On success the ++ * task is ready to interruptible sleep. ++ * ++ * Return: ++ * - 1 - One of the futexes was woken by another thread ++ * - 0 - Success ++ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL ++ */ ++static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) ++{ ++ struct futex_hash_bucket *hb; ++ bool retry = false; ++ int ret, i; ++ u32 uval; ++ ++ /* ++ * Enqueuing multiple futexes is tricky, because we need to enqueue ++ * each futex on the list before dealing with the next one to avoid ++ * deadlocking on the hash bucket. But, before enqueuing, we need to ++ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't ++ * lose any wake events, which cannot be done before the get_futex_key ++ * of the next key, because it calls get_user_pages, which can sleep. ++ * Thus, we fetch the list of futexes keys in two steps, by first ++ * pinning all the memory keys in the futex key, and only then we read ++ * each key and queue the corresponding futex. ++ * ++ * Private futexes doesn't need to recalculate hash in retry, so skip ++ * get_futex_key() when retrying. ++ */ ++retry: ++ for (i = 0; i < count; i++) { ++ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) ++ continue; ++ ++ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), ++ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), ++ &vs[i].q.key, FUTEX_READ); ++ ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; ++ struct futex_q *q = &vs[i].q; ++ u32 val = (u32)vs[i].w.val; ++ ++ hb = queue_lock(q); ++ ret = get_futex_value_locked(&uval, uaddr); ++ ++ if (!ret && uval == val) { ++ /* ++ * The bucket lock can't be held while dealing with the ++ * next futex. Queue each futex at this moment so hb can ++ * be unlocked. ++ */ ++ queue_me(q, hb); ++ continue; ++ } ++ ++ queue_unlock(hb); ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * Even if something went wrong, if we find out that a futex ++ * was woken, we don't return error and return this index to ++ * userspace ++ */ ++ *woken = unqueue_multiple(vs, i); ++ if (*woken >= 0) ++ return 1; ++ ++ if (ret) { ++ /* ++ * If we need to handle a page fault, we need to do so ++ * without any lock and any enqueued futex (otherwise ++ * we could lose some wakeup). So we do it here, after ++ * undoing all the work done so far. In success, we ++ * retry all the work. ++ */ ++ if (get_user(uval, uaddr)) ++ return -EFAULT; ++ ++ retry = true; ++ goto retry; ++ } ++ ++ if (uval != val) ++ return -EWOULDBLOCK; ++ } ++ ++ return 0; ++} ++ ++/** ++ * futex_sleep_multiple - Check sleeping conditions and sleep ++ * @vs: List of futexes to wait for ++ * @count: Length of vs ++ * @to: Timeout ++ * ++ * Sleep if and only if the timeout hasn't expired and no futex on the list has ++ * been woken up. ++ */ ++static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, ++ struct hrtimer_sleeper *to) ++{ ++ if (to && !to->task) ++ return; ++ ++ for (; count; count--, vs++) { ++ if (!READ_ONCE(vs->q.lock_ptr)) ++ return; ++ } ++ ++ freezable_schedule(); ++} ++ ++/** ++ * futex_wait_multiple - Prepare to wait on and enqueue several futexes ++ * @vs: The list of futexes to wait on ++ * @count: The number of objects ++ * @to: Timeout before giving up and returning to userspace ++ * ++ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function ++ * sleeps on a group of futexes and returns on the first futex that is ++ * wake, or after the timeout has elapsed. ++ * ++ * Return: ++ * - >=0 - Hint to the futex that was awoken ++ * - <0 - On error ++ */ ++int futex_wait_multiple(struct futex_vector *vs, unsigned int count, ++ struct hrtimer_sleeper *to) ++{ ++ int ret, hint = 0; ++ ++ if (to) ++ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); ++ ++ while (1) { ++ ret = futex_wait_multiple_setup(vs, count, &hint); ++ if (ret) { ++ if (ret > 0) { ++ /* A futex was woken during setup */ ++ ret = hint; ++ } ++ return ret; ++ } ++ ++ futex_sleep_multiple(vs, count, to); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ ret = unqueue_multiple(vs, count); ++ if (ret >= 0) ++ return ret; ++ ++ if (to && !to->task) ++ return -ETIMEDOUT; ++ else if (signal_pending(current)) ++ return -ERESTARTSYS; ++ /* ++ * The final case is a spurious wakeup, for ++ * which just retry. ++ */ ++ } ++} ++/* Mask of available flags for each futex in futex_waitv list */ ++#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) ++ ++/** ++ * sys_futex_waitv - Wait on a list of futexes ++ * @waiters: List of futexes to wait on ++ * @nr_futexes: Length of futexv ++ * @flags: Flag for timeout (monotonic/realtime) ++ * @timeout: Optional absolute timeout. ++ * @clockid: Clock to be used for the timeout, realtime or monotonic. ++ * ++ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes ++ * if a futex_wake() is performed at any uaddr. The syscall returns immediately ++ * if any waiter has *uaddr != val. *timeout is an optional timeout value for ++ * the operation. Each waiter has individual flags. The `flags` argument for ++ * the syscall should be used solely for specifying the timeout as realtime, if ++ * needed. Flags for private futexes, sizes, etc. should be used on the ++ * individual flags of each waiter. ++ * ++ * Returns the array index of one of the woken futexes. No further information ++ * is provided: any number of other futexes may also have been woken by the ++ * same event, and if more than one futex was woken, the retrned index may ++ * refer to any one of them. (It is not necessaryily the futex with the ++ * smallest index, nor the one most recently woken, nor...) ++ */ ++ ++SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, ++ unsigned int, nr_futexes, unsigned int, flags, ++ struct __kernel_timespec __user *, timeout, clockid_t, clockid) ++{ ++ struct hrtimer_sleeper to; ++ struct futex_vector *futexv; ++ struct timespec64 ts; ++ ktime_t time; ++ int ret; ++ ++ /* This syscall supports no flags for now */ ++ if (flags) ++ return -EINVAL; ++ ++ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) ++ return -EINVAL; ++ ++ if (timeout) { ++ int flag_clkid = 0, flag_init = 0; ++ ++ if (clockid == CLOCK_REALTIME) { ++ flag_clkid = FLAGS_CLOCKRT; ++ flag_init = FUTEX_CLOCK_REALTIME; ++ } ++ ++ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) ++ return -EINVAL; ++ ++ if (get_timespec64(&ts, timeout)) ++ return -EFAULT; ++ ++ /* ++ * Since there's no opcode for futex_waitv, use ++ * FUTEX_WAIT_BITSET that uses absolute timeout as well ++ */ ++ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); ++ if (ret) ++ return ret; ++ ++ futex_setup_timer(&time, &to, flag_clkid, 0); ++ } ++ ++ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); ++ if (!futexv) ++ return -ENOMEM; ++ ++ ret = futex_parse_waitv(futexv, waiters, nr_futexes); ++ if (!ret) ++ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); ++ ++ if (timeout) { ++ hrtimer_cancel(&to.timer); ++ destroy_hrtimer_on_stack(&to.timer); ++ } ++ ++ kfree(futexv); ++ return ret; ++} ++ + #ifdef CONFIG_COMPAT_32BIT_TIME + SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index 0ea8128468c3..0979fac9414d 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -150,6 +150,7 @@ COND_SYSCALL(set_robust_list); + COND_SYSCALL_COMPAT(set_robust_list); + COND_SYSCALL(get_robust_list); + COND_SYSCALL_COMPAT(get_robust_list); ++COND_SYSCALL(futex_waitv); + + /* kernel/hrtimer.c */ + +-- +2.33.1 + +From 4e40f3886e134f33c50ca79bc8b323cea784bd78 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 23 Sep 2021 14:11:06 -0300 +Subject: [PATCH 2/2] futex,x86: Wire up sys_futex_waitv() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Wire up syscall entry point for x86 arch, for both i386 and x86_64. + +Signed-off-by: André Almeida +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20210923171111.300673-18-andrealmeid@collabora.com +--- + arch/x86/entry/syscalls/syscall_32.tbl | 1 + + arch/x86/entry/syscalls/syscall_64.tbl | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 4bbc267fb36b..b2b9b9df1355 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -453,3 +453,4 @@ + 446 i386 landlock_restrict_self sys_landlock_restrict_self + 447 i386 memfd_secret sys_memfd_secret + 448 i386 process_mrelease sys_process_mrelease ++449 i386 futex_waitv sys_futex_waitv +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index ce18119ea0d0..bfd4e8f5be34 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -370,6 +370,7 @@ + 446 common landlock_restrict_self sys_landlock_restrict_self + 447 common memfd_secret sys_memfd_secret + 448 common process_mrelease sys_process_mrelease ++449 common futex_waitv sys_futex_waitv + + # + # Due to a historical design error, certain syscalls are numbered differently +-- +2.33.1 +