linux513/514/515-tkg: Add futex_waitv() patchset from 5.16 as a toggle option (#342)
* linux513/514/515-tkg: Add futex_waitv() patchset from 5.16 as a toggle option. Not enabled by default, since it breaks current public Wine/Proton builds that rely on the old fsync interfaces. https://github.com/ValveSoftware/wine/pull/128 https://github.com/andrealmeid/futex_waitv_patches * Minor message/comment tweaks Co-authored-by: Etienne JUVIGNY <ti3nou@gmail.com>
This commit is contained in:
6
PKGBUILD
6
PKGBUILD
@@ -500,6 +500,7 @@ case $_basever in
|
||||
0006-add-acs-overrides_iommu.patch
|
||||
0007-v5.13-fsync.patch
|
||||
0007-v5.13-futex2_interface.patch
|
||||
0007-v5.13-futex_waitv.patch
|
||||
0007-v5.13-winesync.patch
|
||||
0008-5.13-bcachefs.patch
|
||||
0009-glitched-ondemand-bmq.patch
|
||||
@@ -530,6 +531,7 @@ case $_basever in
|
||||
'19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a'
|
||||
'89d837bfea3515504b1c99fc881ebdc4f15e2999558127a263e795fc69408a39'
|
||||
'9ec679871cba674cf876ba836cde969296ae5034bcc10e1ec39b372e6e07aab0'
|
||||
'0e3473c19e5513bee886f03cf2476f746d8b5b2fbc0841c9d60d609b16a97c14'
|
||||
'034d12a73b507133da2c69a34d61efd2f6b6618549650aa26d748142d22002e1'
|
||||
'b0004bc559653fd8719b8adcfa1ead1075db3425d30d7d7adb8cbc6296386a8f'
|
||||
'9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177'
|
||||
@@ -560,6 +562,7 @@ case $_basever in
|
||||
0006-add-acs-overrides_iommu.patch
|
||||
0007-v5.14-fsync.patch
|
||||
0007-v5.14-futex2_interface.patch
|
||||
0007-v5.14-futex_waitv.patch
|
||||
0007-v5.14-winesync.patch
|
||||
#0008-5.14-bcachefs.patch
|
||||
0009-glitched-ondemand-bmq.patch
|
||||
@@ -586,6 +589,7 @@ case $_basever in
|
||||
'19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a'
|
||||
'aa67e81a27d9062e463594acb91eca6dd13388f23cbe53ca56298f9dba61cc10'
|
||||
'efe5e21706fdf64559ead866c85a5d88c5c3f743d814410df3810ca61cc5b966'
|
||||
'5742277f41f22bf29fa9742562946b8a01377f8a22adb42ceed3607541c1d5b6'
|
||||
'034d12a73b507133da2c69a34d61efd2f6b6618549650aa26d748142d22002e1'
|
||||
'9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177'
|
||||
'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911'
|
||||
@@ -614,6 +618,7 @@ case $_basever in
|
||||
0006-add-acs-overrides_iommu.patch
|
||||
0007-v5.15-fsync.patch
|
||||
#0007-v5.15-futex2_interface.patch
|
||||
0007-v5.15-futex_waitv.patch
|
||||
0007-v5.15-winesync.patch
|
||||
#0008-5.14-bcachefs.patch
|
||||
#0009-glitched-ondemand-bmq.patch
|
||||
@@ -640,6 +645,7 @@ case $_basever in
|
||||
'19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a'
|
||||
'6c4f0099896f69e56ebd8c9eac266ac8ad993acecd50945e0e84ef6f95f9ddca'
|
||||
#'efe5e21706fdf64559ead866c85a5d88c5c3f743d814410df3810ca61cc5b966'
|
||||
'c8f7c50d9b1418ba22b5ca735c47111a162be416109714d26a674162e5b2cb97'
|
||||
'034d12a73b507133da2c69a34d61efd2f6b6618549650aa26d748142d22002e1'
|
||||
#'9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177'
|
||||
#'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911'
|
||||
|
@@ -154,6 +154,11 @@ _fsync="true"
|
||||
# https://gitlab.collabora.com/tonyk/linux/-/tree/futex2-dev
|
||||
_futex2="true"
|
||||
|
||||
# Set to "true" to enable backported patches adding support for the futex_waitv() syscall, a new interface for fsync. It will land in mainline with the Linux 5.16 release and requires a wine/proton build with built-in support for it. It's expected to be available in Valve Proton 6.3 stable soon - https://github.com/ValveSoftware/wine/pull/128
|
||||
# !! Disables fsync/futex2 interfaces support !!
|
||||
# https://github.com/andrealmeid/futex_waitv_patches
|
||||
_futex_waitv="false"
|
||||
|
||||
# Set to "true" to enable support for winesync, an experimental replacement for esync - requires patched wine - https://repo.or.cz/linux/zf.git/shortlog/refs/heads/winesync
|
||||
# ! Can't be used on multiple kernels installed side-by-side, which will require https://aur.archlinux.org/packages/winesync-dkms/ instead of this option !
|
||||
_winesync="false"
|
||||
|
@@ -571,7 +571,7 @@ _tkg_srcprep() {
|
||||
_enable "CRYPTO_LZ4" "CRYPTO_LZ4HC" "LZ4_COMPRESS" "LZ4HC_COMPRESS" "ZSWAP_COMPRESSOR_DEFAULT_LZ4" "CMDLINE_BOOL" "CONFIG_BLK_DEV_LOOP"
|
||||
_disable "DEBUG_FORCE_FUNCTION_ALIGN_64B"
|
||||
scripts/config --set-str "ZSWAP_COMPRESSOR_DEFAULT" "lz4"
|
||||
if [ "$_futex2" = "true" ] && [ "$_basever" != "54" ] && [ "$_basever" != "57" ] && [ "$_basever" != "58" ] && [ "$_basever" != "59" ]; then
|
||||
if [ "$_futex2" = "true" ] && [ "$_futex_waitv" != "true" ] && [ "$_basever" != "54" ] && [ "$_basever" != "57" ] && [ "$_basever" != "58" ] && [ "$_basever" != "59" ]; then
|
||||
sed -i -e 's/# CONFIG_EXPERT is not set/CONFIG_EXPERT=y/' ./.config
|
||||
echo -e "\r# start of config expert\r
|
||||
# CONFIG_DEBUG_RSEQ is not set\r
|
||||
@@ -1140,6 +1140,25 @@ CONFIG_DEBUG_INFO_BTF_MODULES=y\r
|
||||
fi
|
||||
fi
|
||||
|
||||
# futex_waitv support
|
||||
tkgpatch="$srcdir/0007-v${_basekernel}-futex_waitv.patch"
|
||||
if [ -e "$tkgpatch" ]; then
|
||||
if [ -z "$_futex_waitv" ]; then
|
||||
plain ""
|
||||
plain "Enable support for futex_waitv, backported patches for fsync from 5.16 Kernel"
|
||||
plain "! Will disable fsync/futex2 patchsets !"
|
||||
plain "https://github.com/andrealmeid/futex_waitv_patches"
|
||||
plain "https://github.com/ValveSoftware/wine/pull/128"
|
||||
read -rp "`echo $' > N/y : '`" CONDITION9;
|
||||
fi
|
||||
if [[ "$CONDITION9" =~ [yY] ]] || [ "$_futex_waitv" = "true" ]; then
|
||||
_msg="Patching futex_waitv support"
|
||||
_tkg_patcher
|
||||
_fsync="false"
|
||||
_futex2="false"
|
||||
fi
|
||||
fi
|
||||
|
||||
# fsync support
|
||||
tkgpatch="$srcdir/0007-v${_basekernel}-fsync.patch"
|
||||
if [ -e "$tkgpatch" ]; then
|
||||
@@ -1147,9 +1166,9 @@ CONFIG_DEBUG_INFO_BTF_MODULES=y\r
|
||||
plain ""
|
||||
plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+"
|
||||
plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305"
|
||||
read -rp "`echo $' > N/y : '`" CONDITION9;
|
||||
read -rp "`echo $' > N/y : '`" CONDITION10;
|
||||
fi
|
||||
if [[ "$CONDITION9" =~ [yY] ]] || [ "$_fsync" = "true" ]; then
|
||||
if [[ "$CONDITION10" =~ [yY] ]] || [ "$_fsync" = "true" ]; then
|
||||
_msg="Patching Fsync support"
|
||||
_tkg_patcher
|
||||
fi
|
||||
@@ -1164,9 +1183,9 @@ CONFIG_DEBUG_INFO_BTF_MODULES=y\r
|
||||
plain "Can be enabled alongside regular fsync patchset to have a fallback option"
|
||||
plain "https://gitlab.collabora.com/tonyk/linux/-/tree/futex2-dev"
|
||||
plain "https://github.com/ValveSoftware/Proton/issues/4568"
|
||||
read -rp "`echo $' > N/y : '`" CONDITION10;
|
||||
read -rp "`echo $' > N/y : '`" CONDITION11;
|
||||
fi
|
||||
if [[ "$CONDITION10" =~ [yY] ]] || [ "$_futex2" = "true" ]; then
|
||||
if [[ "$CONDITION11" =~ [yY] ]] || [ "$_futex2" = "true" ]; then
|
||||
_msg="Patching futex2 support"
|
||||
_tkg_patcher
|
||||
_enable "FUTEX2"
|
||||
|
536
linux-tkg-patches/5.13/0007-v5.13-futex_waitv.patch
Normal file
536
linux-tkg-patches/5.13/0007-v5.13-futex_waitv.patch
Normal file
@@ -0,0 +1,536 @@
|
||||
From 4901e29e3c0237c52eadd2c82deb9bd6e7add5ac Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
|
||||
Date: Thu, 23 Sep 2021 14:11:05 -0300
|
||||
Subject: [PATCH 1/2] futex: Implement sys_futex_waitv()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Add support to wait on multiple futexes. This is the interface
|
||||
implemented by this syscall:
|
||||
|
||||
futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes,
|
||||
unsigned int flags, struct timespec *timeout, clockid_t clockid)
|
||||
|
||||
struct futex_waitv {
|
||||
__u64 val;
|
||||
__u64 uaddr;
|
||||
__u32 flags;
|
||||
__u32 __reserved;
|
||||
};
|
||||
|
||||
Given an array of struct futex_waitv, wait on each uaddr. The thread
|
||||
wakes if a futex_wake() is performed at any uaddr. The syscall returns
|
||||
immediately if any waiter has *uaddr != val. *timeout is an optional
|
||||
absolute timeout value for the operation. This syscall supports only
|
||||
64bit sized timeout structs. The flags argument of the syscall should be
|
||||
empty, but it can be used for future extensions. Flags for shared
|
||||
futexes, sizes, etc. should be used on the individual flags of each
|
||||
waiter.
|
||||
|
||||
__reserved is used for explicit padding and should be 0, but it might be
|
||||
used for future extensions. If the userspace uses 32-bit pointers, it
|
||||
should make sure to explicitly cast it when assigning to waitv::uaddr.
|
||||
|
||||
Returns the array index of one of the woken futexes. There’s no given
|
||||
information of how many were woken, or any particular attribute of it
|
||||
(if it’s the first woken, if it is of the smaller index...).
|
||||
|
||||
Signed-off-by: André Almeida <andrealmeid@collabora.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com
|
||||
---
|
||||
include/linux/syscalls.h | 6 +
|
||||
include/uapi/asm-generic/unistd.h | 5 +-
|
||||
include/uapi/linux/futex.h | 26 +++
|
||||
kernel/futex.c | 334 ++++++++++++++++++++++++++++++
|
||||
kernel/sys_ni.c | 1 +
|
||||
5 files changed, 371 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
|
||||
index 050511e8f1f8..8390482cf082 100644
|
||||
--- a/include/linux/syscalls.h
|
||||
+++ b/include/linux/syscalls.h
|
||||
@@ -58,6 +58,7 @@ struct mq_attr;
|
||||
struct compat_stat;
|
||||
struct old_timeval32;
|
||||
struct robust_list_head;
|
||||
+struct futex_waitv;
|
||||
struct getcpu_cache;
|
||||
struct old_linux_dirent;
|
||||
struct perf_event_attr;
|
||||
@@ -623,6 +624,11 @@ asmlinkage long sys_get_robust_list(int pid,
|
||||
asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
|
||||
size_t len);
|
||||
|
||||
+asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
|
||||
+ unsigned int nr_futexes, unsigned int flags,
|
||||
+ struct __kernel_timespec __user *timeout, clockid_t clockid);
|
||||
+
|
||||
+
|
||||
/* kernel/hrtimer.c */
|
||||
asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
|
||||
struct __kernel_timespec __user *rmtp);
|
||||
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
|
||||
index d2a942086fcb..3f55ac23cea9 100644
|
||||
--- a/include/uapi/asm-generic/unistd.h
|
||||
+++ b/include/uapi/asm-generic/unistd.h
|
||||
@@ -872,8 +872,11 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
|
||||
#define __NR_landlock_restrict_self 446
|
||||
__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
|
||||
|
||||
+#define __NR_futex_waitv 449
|
||||
+__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
|
||||
+
|
||||
#undef __NR_syscalls
|
||||
-#define __NR_syscalls 447
|
||||
+#define __NR_syscalls 450
|
||||
|
||||
/*
|
||||
* 32 bit systems traditionally used different
|
||||
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
|
||||
index a89eb0accd5e..1666f5e4b837 100644
|
||||
--- a/include/uapi/linux/futex.h
|
||||
+++ b/include/uapi/linux/futex.h
|
||||
@@ -41,6 +41,32 @@
|
||||
#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
|
||||
FUTEX_PRIVATE_FLAG)
|
||||
|
||||
+ /*
|
||||
+ * Flags to specify the bit length of the futex word for futex2 syscalls.
|
||||
+ * Currently, only 32 is supported.
|
||||
+ */
|
||||
+#define FUTEX_32 2
|
||||
+
|
||||
+/*
|
||||
+ * Max numbers of elements in a futex_waitv array
|
||||
+ */
|
||||
+#define FUTEX_WAITV_MAX 128
|
||||
+
|
||||
+/**
|
||||
+ * struct futex_waitv - A waiter for vectorized wait
|
||||
+ * @val: Expected value at uaddr
|
||||
+ * @uaddr: User address to wait on
|
||||
+ * @flags: Flags for this waiter
|
||||
+ * @__reserved: Reserved member to preserve data alignment. Should be 0.
|
||||
+ */
|
||||
+struct futex_waitv {
|
||||
+ __u64 val;
|
||||
+ __u64 uaddr;
|
||||
+ __u32 flags;
|
||||
+ __u32 __reserved;
|
||||
+};
|
||||
+
|
||||
+
|
||||
/*
|
||||
* Support for robust futexes: the kernel cleans up held futexes at
|
||||
* thread exit time.
|
||||
diff --git a/kernel/futex.c b/kernel/futex.c
|
||||
index 408cad5e8968..d7dc0bd9379c 100644
|
||||
--- a/kernel/futex.c
|
||||
+++ b/kernel/futex.c
|
||||
@@ -227,6 +227,18 @@ static const struct futex_q futex_q_init = {
|
||||
.bitset = FUTEX_BITSET_MATCH_ANY
|
||||
};
|
||||
|
||||
+/**
|
||||
+ * struct futex_vector - Auxiliary struct for futex_waitv()
|
||||
+ * @w: Userspace provided data
|
||||
+ * @q: Kernel side data
|
||||
+ *
|
||||
+ * Struct used to build an array with all data need for futex_waitv()
|
||||
+ */
|
||||
+struct futex_vector {
|
||||
+ struct futex_waitv w;
|
||||
+ struct futex_q q;
|
||||
+};
|
||||
+
|
||||
/*
|
||||
* Hash buckets are shared by all the futex_keys that hash to the same
|
||||
* location. Each key may have multiple futex_q structures, one for each task
|
||||
@@ -3962,6 +3974,328 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
|
||||
}
|
||||
#endif /* CONFIG_COMPAT */
|
||||
|
||||
+/* Mask of available flags for each futex in futex_waitv list */
|
||||
+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
|
||||
+
|
||||
+/**
|
||||
+ * futex_parse_waitv - Parse a waitv array from userspace
|
||||
+ * @futexv: Kernel side list of waiters to be filled
|
||||
+ * @uwaitv: Userspace list to be parsed
|
||||
+ * @nr_futexes: Length of futexv
|
||||
+ *
|
||||
+ * Return: Error code on failure, 0 on success
|
||||
+ */
|
||||
+static int futex_parse_waitv(struct futex_vector *futexv,
|
||||
+ struct futex_waitv __user *uwaitv,
|
||||
+ unsigned int nr_futexes)
|
||||
+{
|
||||
+ struct futex_waitv aux;
|
||||
+ unsigned int i;
|
||||
+
|
||||
+ for (i = 0; i < nr_futexes; i++) {
|
||||
+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
|
||||
+ return -EFAULT;
|
||||
+
|
||||
+ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (!(aux.flags & FUTEX_32))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ futexv[i].w.flags = aux.flags;
|
||||
+ futexv[i].w.val = aux.val;
|
||||
+ futexv[i].w.uaddr = aux.uaddr;
|
||||
+ futexv[i].q = futex_q_init;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * unqueue_multiple - Remove various futexes from their hash bucket
|
||||
+ * @v: The list of futexes to unqueue
|
||||
+ * @count: Number of futexes in the list
|
||||
+ *
|
||||
+ * Helper to unqueue a list of futexes. This can't fail.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * - >=0 - Index of the last futex that was awoken;
|
||||
+ * - -1 - No futex was awoken
|
||||
+ */
|
||||
+static int unqueue_multiple(struct futex_vector *v, int count)
|
||||
+{
|
||||
+ int ret = -1, i;
|
||||
+
|
||||
+ for (i = 0; i < count; i++) {
|
||||
+ if (!unqueue_me(&v[i].q))
|
||||
+ ret = i;
|
||||
+ }
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
|
||||
+ * @vs: The futex list to wait on
|
||||
+ * @count: The size of the list
|
||||
+ * @woken: Index of the last woken futex, if any. Used to notify the
|
||||
+ * caller that it can return this index to userspace (return parameter)
|
||||
+ *
|
||||
+ * Prepare multiple futexes in a single step and enqueue them. This may fail if
|
||||
+ * the futex list is invalid or if any futex was already awoken. On success the
|
||||
+ * task is ready to interruptible sleep.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * - 1 - One of the futexes was woken by another thread
|
||||
+ * - 0 - Success
|
||||
+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
|
||||
+ */
|
||||
+static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
|
||||
+{
|
||||
+ struct futex_hash_bucket *hb;
|
||||
+ bool retry = false;
|
||||
+ int ret, i;
|
||||
+ u32 uval;
|
||||
+
|
||||
+ /*
|
||||
+ * Enqueuing multiple futexes is tricky, because we need to enqueue
|
||||
+ * each futex on the list before dealing with the next one to avoid
|
||||
+ * deadlocking on the hash bucket. But, before enqueuing, we need to
|
||||
+ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
|
||||
+ * lose any wake events, which cannot be done before the get_futex_key
|
||||
+ * of the next key, because it calls get_user_pages, which can sleep.
|
||||
+ * Thus, we fetch the list of futexes keys in two steps, by first
|
||||
+ * pinning all the memory keys in the futex key, and only then we read
|
||||
+ * each key and queue the corresponding futex.
|
||||
+ *
|
||||
+ * Private futexes doesn't need to recalculate hash in retry, so skip
|
||||
+ * get_futex_key() when retrying.
|
||||
+ */
|
||||
+retry:
|
||||
+ for (i = 0; i < count; i++) {
|
||||
+ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
|
||||
+ continue;
|
||||
+
|
||||
+ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
|
||||
+ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
|
||||
+ &vs[i].q.key, FUTEX_READ);
|
||||
+
|
||||
+ if (unlikely(ret))
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ set_current_state(TASK_INTERRUPTIBLE);
|
||||
+
|
||||
+ for (i = 0; i < count; i++) {
|
||||
+ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
|
||||
+ struct futex_q *q = &vs[i].q;
|
||||
+ u32 val = (u32)vs[i].w.val;
|
||||
+
|
||||
+ hb = queue_lock(q);
|
||||
+ ret = get_futex_value_locked(&uval, uaddr);
|
||||
+
|
||||
+ if (!ret && uval == val) {
|
||||
+ /*
|
||||
+ * The bucket lock can't be held while dealing with the
|
||||
+ * next futex. Queue each futex at this moment so hb can
|
||||
+ * be unlocked.
|
||||
+ */
|
||||
+ queue_me(q, hb);
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ queue_unlock(hb);
|
||||
+ __set_current_state(TASK_RUNNING);
|
||||
+
|
||||
+ /*
|
||||
+ * Even if something went wrong, if we find out that a futex
|
||||
+ * was woken, we don't return error and return this index to
|
||||
+ * userspace
|
||||
+ */
|
||||
+ *woken = unqueue_multiple(vs, i);
|
||||
+ if (*woken >= 0)
|
||||
+ return 1;
|
||||
+
|
||||
+ if (ret) {
|
||||
+ /*
|
||||
+ * If we need to handle a page fault, we need to do so
|
||||
+ * without any lock and any enqueued futex (otherwise
|
||||
+ * we could lose some wakeup). So we do it here, after
|
||||
+ * undoing all the work done so far. In success, we
|
||||
+ * retry all the work.
|
||||
+ */
|
||||
+ if (get_user(uval, uaddr))
|
||||
+ return -EFAULT;
|
||||
+
|
||||
+ retry = true;
|
||||
+ goto retry;
|
||||
+ }
|
||||
+
|
||||
+ if (uval != val)
|
||||
+ return -EWOULDBLOCK;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * futex_sleep_multiple - Check sleeping conditions and sleep
|
||||
+ * @vs: List of futexes to wait for
|
||||
+ * @count: Length of vs
|
||||
+ * @to: Timeout
|
||||
+ *
|
||||
+ * Sleep if and only if the timeout hasn't expired and no futex on the list has
|
||||
+ * been woken up.
|
||||
+ */
|
||||
+static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
|
||||
+ struct hrtimer_sleeper *to)
|
||||
+{
|
||||
+ if (to && !to->task)
|
||||
+ return;
|
||||
+
|
||||
+ for (; count; count--, vs++) {
|
||||
+ if (!READ_ONCE(vs->q.lock_ptr))
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ freezable_schedule();
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * futex_wait_multiple - Prepare to wait on and enqueue several futexes
|
||||
+ * @vs: The list of futexes to wait on
|
||||
+ * @count: The number of objects
|
||||
+ * @to: Timeout before giving up and returning to userspace
|
||||
+ *
|
||||
+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
|
||||
+ * sleeps on a group of futexes and returns on the first futex that is
|
||||
+ * wake, or after the timeout has elapsed.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * - >=0 - Hint to the futex that was awoken
|
||||
+ * - <0 - On error
|
||||
+ */
|
||||
+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
|
||||
+ struct hrtimer_sleeper *to)
|
||||
+{
|
||||
+ int ret, hint = 0;
|
||||
+
|
||||
+ if (to)
|
||||
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
|
||||
+
|
||||
+ while (1) {
|
||||
+ ret = futex_wait_multiple_setup(vs, count, &hint);
|
||||
+ if (ret) {
|
||||
+ if (ret > 0) {
|
||||
+ /* A futex was woken during setup */
|
||||
+ ret = hint;
|
||||
+ }
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ futex_sleep_multiple(vs, count, to);
|
||||
+
|
||||
+ __set_current_state(TASK_RUNNING);
|
||||
+
|
||||
+ ret = unqueue_multiple(vs, count);
|
||||
+ if (ret >= 0)
|
||||
+ return ret;
|
||||
+
|
||||
+ if (to && !to->task)
|
||||
+ return -ETIMEDOUT;
|
||||
+ else if (signal_pending(current))
|
||||
+ return -ERESTARTSYS;
|
||||
+ /*
|
||||
+ * The final case is a spurious wakeup, for
|
||||
+ * which just retry.
|
||||
+ */
|
||||
+ }
|
||||
+}
|
||||
+/* Mask of available flags for each futex in futex_waitv list */
|
||||
+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
|
||||
+
|
||||
+/**
|
||||
+ * sys_futex_waitv - Wait on a list of futexes
|
||||
+ * @waiters: List of futexes to wait on
|
||||
+ * @nr_futexes: Length of futexv
|
||||
+ * @flags: Flag for timeout (monotonic/realtime)
|
||||
+ * @timeout: Optional absolute timeout.
|
||||
+ * @clockid: Clock to be used for the timeout, realtime or monotonic.
|
||||
+ *
|
||||
+ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
|
||||
+ * if a futex_wake() is performed at any uaddr. The syscall returns immediately
|
||||
+ * if any waiter has *uaddr != val. *timeout is an optional timeout value for
|
||||
+ * the operation. Each waiter has individual flags. The `flags` argument for
|
||||
+ * the syscall should be used solely for specifying the timeout as realtime, if
|
||||
+ * needed. Flags for private futexes, sizes, etc. should be used on the
|
||||
+ * individual flags of each waiter.
|
||||
+ *
|
||||
+ * Returns the array index of one of the woken futexes. No further information
|
||||
+ * is provided: any number of other futexes may also have been woken by the
|
||||
+ * same event, and if more than one futex was woken, the retrned index may
|
||||
+ * refer to any one of them. (It is not necessaryily the futex with the
|
||||
+ * smallest index, nor the one most recently woken, nor...)
|
||||
+ */
|
||||
+
|
||||
+SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
|
||||
+ unsigned int, nr_futexes, unsigned int, flags,
|
||||
+ struct __kernel_timespec __user *, timeout, clockid_t, clockid)
|
||||
+{
|
||||
+ struct hrtimer_sleeper to;
|
||||
+ struct futex_vector *futexv;
|
||||
+ struct timespec64 ts;
|
||||
+ ktime_t time;
|
||||
+ int ret;
|
||||
+
|
||||
+ /* This syscall supports no flags for now */
|
||||
+ if (flags)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (timeout) {
|
||||
+ int flag_clkid = 0, flag_init = 0;
|
||||
+
|
||||
+ if (clockid == CLOCK_REALTIME) {
|
||||
+ flag_clkid = FLAGS_CLOCKRT;
|
||||
+ flag_init = FUTEX_CLOCK_REALTIME;
|
||||
+ }
|
||||
+
|
||||
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (get_timespec64(&ts, timeout))
|
||||
+ return -EFAULT;
|
||||
+
|
||||
+ /*
|
||||
+ * Since there's no opcode for futex_waitv, use
|
||||
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
|
||||
+ */
|
||||
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
+ futex_setup_timer(&time, &to, flag_clkid, 0);
|
||||
+ }
|
||||
+
|
||||
+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
|
||||
+ if (!futexv)
|
||||
+ return -ENOMEM;
|
||||
+
|
||||
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes);
|
||||
+ if (!ret)
|
||||
+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
|
||||
+
|
||||
+ if (timeout) {
|
||||
+ hrtimer_cancel(&to.timer);
|
||||
+ destroy_hrtimer_on_stack(&to.timer);
|
||||
+ }
|
||||
+
|
||||
+ kfree(futexv);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
#ifdef CONFIG_COMPAT_32BIT_TIME
|
||||
SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
|
||||
const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
|
||||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
|
||||
index 0ea8128468c3..0979fac9414d 100644
|
||||
--- a/kernel/sys_ni.c
|
||||
+++ b/kernel/sys_ni.c
|
||||
@@ -150,6 +150,7 @@ COND_SYSCALL(set_robust_list);
|
||||
COND_SYSCALL_COMPAT(set_robust_list);
|
||||
COND_SYSCALL(get_robust_list);
|
||||
COND_SYSCALL_COMPAT(get_robust_list);
|
||||
+COND_SYSCALL(futex_waitv);
|
||||
|
||||
/* kernel/hrtimer.c */
|
||||
|
||||
--
|
||||
2.33.1
|
||||
|
||||
From 4e40f3886e134f33c50ca79bc8b323cea784bd78 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
|
||||
Date: Thu, 23 Sep 2021 14:11:06 -0300
|
||||
Subject: [PATCH 2/2] futex,x86: Wire up sys_futex_waitv()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Wire up syscall entry point for x86 arch, for both i386 and x86_64.
|
||||
|
||||
Signed-off-by: André Almeida <andrealmeid@collabora.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Link: https://lore.kernel.org/r/20210923171111.300673-18-andrealmeid@collabora.com
|
||||
---
|
||||
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
|
||||
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
|
||||
2 files changed, 2 insertions(+)
|
||||
|
||||
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
index 4bbc267fb36b..b2b9b9df1355 100644
|
||||
--- a/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
@@ -451,3 +451,4 @@
|
||||
444 i386 landlock_create_ruleset sys_landlock_create_ruleset
|
||||
445 i386 landlock_add_rule sys_landlock_add_rule
|
||||
446 i386 landlock_restrict_self sys_landlock_restrict_self
|
||||
+449 i386 futex_waitv sys_futex_waitv
|
||||
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
index ce18119ea0d0..bfd4e8f5be34 100644
|
||||
--- a/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
@@ -368,6 +368,7 @@
|
||||
444 common landlock_create_ruleset sys_landlock_create_ruleset
|
||||
445 common landlock_add_rule sys_landlock_add_rule
|
||||
446 common landlock_restrict_self sys_landlock_restrict_self
|
||||
+449 common futex_waitv sys_futex_waitv
|
||||
|
||||
#
|
||||
# Due to a historical design error, certain syscalls are numbered differently
|
||||
--
|
||||
2.33.1
|
||||
|
536
linux-tkg-patches/5.14/0007-v5.14-futex_waitv.patch
Normal file
536
linux-tkg-patches/5.14/0007-v5.14-futex_waitv.patch
Normal file
@@ -0,0 +1,536 @@
|
||||
From 4901e29e3c0237c52eadd2c82deb9bd6e7add5ac Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
|
||||
Date: Thu, 23 Sep 2021 14:11:05 -0300
|
||||
Subject: [PATCH 1/2] futex: Implement sys_futex_waitv()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Add support to wait on multiple futexes. This is the interface
|
||||
implemented by this syscall:
|
||||
|
||||
futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes,
|
||||
unsigned int flags, struct timespec *timeout, clockid_t clockid)
|
||||
|
||||
struct futex_waitv {
|
||||
__u64 val;
|
||||
__u64 uaddr;
|
||||
__u32 flags;
|
||||
__u32 __reserved;
|
||||
};
|
||||
|
||||
Given an array of struct futex_waitv, wait on each uaddr. The thread
|
||||
wakes if a futex_wake() is performed at any uaddr. The syscall returns
|
||||
immediately if any waiter has *uaddr != val. *timeout is an optional
|
||||
absolute timeout value for the operation. This syscall supports only
|
||||
64bit sized timeout structs. The flags argument of the syscall should be
|
||||
empty, but it can be used for future extensions. Flags for shared
|
||||
futexes, sizes, etc. should be used on the individual flags of each
|
||||
waiter.
|
||||
|
||||
__reserved is used for explicit padding and should be 0, but it might be
|
||||
used for future extensions. If the userspace uses 32-bit pointers, it
|
||||
should make sure to explicitly cast it when assigning to waitv::uaddr.
|
||||
|
||||
Returns the array index of one of the woken futexes. There’s no given
|
||||
information of how many were woken, or any particular attribute of it
|
||||
(if it’s the first woken, if it is of the smaller index...).
|
||||
|
||||
Signed-off-by: André Almeida <andrealmeid@collabora.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com
|
||||
---
|
||||
include/linux/syscalls.h | 6 +
|
||||
include/uapi/asm-generic/unistd.h | 5 +-
|
||||
include/uapi/linux/futex.h | 26 +++
|
||||
kernel/futex.c | 334 ++++++++++++++++++++++++++++++
|
||||
kernel/sys_ni.c | 1 +
|
||||
5 files changed, 371 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
|
||||
index 050511e8f1f8..8390482cf082 100644
|
||||
--- a/include/linux/syscalls.h
|
||||
+++ b/include/linux/syscalls.h
|
||||
@@ -58,6 +58,7 @@ struct mq_attr;
|
||||
struct compat_stat;
|
||||
struct old_timeval32;
|
||||
struct robust_list_head;
|
||||
+struct futex_waitv;
|
||||
struct getcpu_cache;
|
||||
struct old_linux_dirent;
|
||||
struct perf_event_attr;
|
||||
@@ -623,6 +624,11 @@ asmlinkage long sys_get_robust_list(int pid,
|
||||
asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
|
||||
size_t len);
|
||||
|
||||
+asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
|
||||
+ unsigned int nr_futexes, unsigned int flags,
|
||||
+ struct __kernel_timespec __user *timeout, clockid_t clockid);
|
||||
+
|
||||
+
|
||||
/* kernel/hrtimer.c */
|
||||
asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
|
||||
struct __kernel_timespec __user *rmtp);
|
||||
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
|
||||
index d2a942086fcb..3f55ac23cea9 100644
|
||||
--- a/include/uapi/asm-generic/unistd.h
|
||||
+++ b/include/uapi/asm-generic/unistd.h
|
||||
@@ -878,8 +878,11 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
|
||||
__SYSCALL(__NR_memfd_secret, sys_memfd_secret)
|
||||
#endif
|
||||
|
||||
+#define __NR_futex_waitv 449
|
||||
+__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
|
||||
+
|
||||
#undef __NR_syscalls
|
||||
-#define __NR_syscalls 448
|
||||
+#define __NR_syscalls 450
|
||||
|
||||
/*
|
||||
* 32 bit systems traditionally used different
|
||||
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
|
||||
index a89eb0accd5e..1666f5e4b837 100644
|
||||
--- a/include/uapi/linux/futex.h
|
||||
+++ b/include/uapi/linux/futex.h
|
||||
@@ -41,6 +41,32 @@
|
||||
#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
|
||||
FUTEX_PRIVATE_FLAG)
|
||||
|
||||
+ /*
|
||||
+ * Flags to specify the bit length of the futex word for futex2 syscalls.
|
||||
+ * Currently, only 32 is supported.
|
||||
+ */
|
||||
+#define FUTEX_32 2
|
||||
+
|
||||
+/*
|
||||
+ * Max number of elements in a futex_waitv array
|
||||
+ */
|
||||
+#define FUTEX_WAITV_MAX 128
|
||||
+
|
||||
+/**
|
||||
+ * struct futex_waitv - A waiter for vectorized wait
|
||||
+ * @val: Expected value at uaddr
|
||||
+ * @uaddr: User address to wait on
|
||||
+ * @flags: Flags for this waiter
|
||||
+ * @__reserved: Reserved member to preserve data alignment. Should be 0.
|
||||
+ */
|
||||
+struct futex_waitv {
|
||||
+ __u64 val;
|
||||
+ __u64 uaddr;
|
||||
+ __u32 flags;
|
||||
+ __u32 __reserved;
|
||||
+};
|
||||
+
|
||||
+
|
||||
/*
|
||||
* Support for robust futexes: the kernel cleans up held futexes at
|
||||
* thread exit time.
|
||||
diff --git a/kernel/futex.c b/kernel/futex.c
|
||||
index 408cad5e8968..d7dc0bd9379c 100644
|
||||
--- a/kernel/futex.c
|
||||
+++ b/kernel/futex.c
|
||||
@@ -227,6 +227,18 @@ static const struct futex_q futex_q_init = {
|
||||
.bitset = FUTEX_BITSET_MATCH_ANY
|
||||
};
|
||||
|
||||
+/**
|
||||
+ * struct futex_vector - Auxiliary struct for futex_waitv()
|
||||
+ * @w: Userspace provided data
|
||||
+ * @q: Kernel side data
|
||||
+ *
|
||||
+ * Struct used to build an array with all data needed for futex_waitv()
|
||||
+ */
|
||||
+struct futex_vector {
|
||||
+ struct futex_waitv w;
|
||||
+ struct futex_q q;
|
||||
+};
|
||||
+
|
||||
/*
|
||||
* Hash buckets are shared by all the futex_keys that hash to the same
|
||||
* location. Each key may have multiple futex_q structures, one for each task
|
||||
@@ -3962,6 +3974,328 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
|
||||
}
|
||||
#endif /* CONFIG_COMPAT */
|
||||
|
||||
+/* Mask of available flags for each futex in futex_waitv list */
|
||||
+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
|
||||
+
|
||||
+/**
|
||||
+ * futex_parse_waitv - Parse a waitv array from userspace
|
||||
+ * @futexv: Kernel side list of waiters to be filled
|
||||
+ * @uwaitv: Userspace list to be parsed
|
||||
+ * @nr_futexes: Length of futexv
|
||||
+ *
|
||||
+ * Return: Error code on failure, 0 on success
|
||||
+ */
|
||||
+static int futex_parse_waitv(struct futex_vector *futexv,
|
||||
+ struct futex_waitv __user *uwaitv,
|
||||
+ unsigned int nr_futexes)
|
||||
+{
|
||||
+ struct futex_waitv aux;
|
||||
+ unsigned int i;
|
||||
+
|
||||
+ for (i = 0; i < nr_futexes; i++) {
|
||||
+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
|
||||
+ return -EFAULT;
|
||||
+
|
||||
+ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (!(aux.flags & FUTEX_32))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ futexv[i].w.flags = aux.flags;
|
||||
+ futexv[i].w.val = aux.val;
|
||||
+ futexv[i].w.uaddr = aux.uaddr;
|
||||
+ futexv[i].q = futex_q_init;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * unqueue_multiple - Remove various futexes from their hash bucket
|
||||
+ * @v: The list of futexes to unqueue
|
||||
+ * @count: Number of futexes in the list
|
||||
+ *
|
||||
+ * Helper to unqueue a list of futexes. This can't fail.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * - >=0 - Index of the last futex that was awoken;
|
||||
+ * - -1 - No futex was awoken
|
||||
+ */
|
||||
+static int unqueue_multiple(struct futex_vector *v, int count)
|
||||
+{
|
||||
+ int ret = -1, i;
|
||||
+
|
||||
+ for (i = 0; i < count; i++) {
|
||||
+ if (!unqueue_me(&v[i].q))
|
||||
+ ret = i;
|
||||
+ }
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
|
||||
+ * @vs: The futex list to wait on
|
||||
+ * @count: The size of the list
|
||||
+ * @woken: Index of the last woken futex, if any. Used to notify the
|
||||
+ * caller that it can return this index to userspace (return parameter)
|
||||
+ *
|
||||
+ * Prepare multiple futexes in a single step and enqueue them. This may fail if
|
||||
+ * the futex list is invalid or if any futex was already awoken. On success the
|
||||
+ * task is ready to interruptible sleep.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * - 1 - One of the futexes was woken by another thread
|
||||
+ * - 0 - Success
|
||||
+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
|
||||
+ */
|
||||
+static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
|
||||
+{
|
||||
+ struct futex_hash_bucket *hb;
|
||||
+ bool retry = false;
|
||||
+ int ret, i;
|
||||
+ u32 uval;
|
||||
+
|
||||
+ /*
|
||||
+ * Enqueuing multiple futexes is tricky, because we need to enqueue
|
||||
+ * each futex on the list before dealing with the next one to avoid
|
||||
+ * deadlocking on the hash bucket. But, before enqueuing, we need to
|
||||
+ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
|
||||
+ * lose any wake events, which cannot be done before the get_futex_key
|
||||
+ * of the next key, because it calls get_user_pages, which can sleep.
|
||||
+ * Thus, we fetch the list of futexes keys in two steps, by first
|
||||
+ * pinning all the memory keys in the futex key, and only then we read
|
||||
+ * each key and queue the corresponding futex.
|
||||
+ *
|
||||
+ * Private futexes don't need to recalculate the hash on retry, so skip
|
||||
+ * get_futex_key() when retrying.
|
||||
+ */
|
||||
+retry:
|
||||
+ for (i = 0; i < count; i++) {
|
||||
+ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
|
||||
+ continue;
|
||||
+
|
||||
+ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
|
||||
+ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
|
||||
+ &vs[i].q.key, FUTEX_READ);
|
||||
+
|
||||
+ if (unlikely(ret))
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ set_current_state(TASK_INTERRUPTIBLE);
|
||||
+
|
||||
+ for (i = 0; i < count; i++) {
|
||||
+ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
|
||||
+ struct futex_q *q = &vs[i].q;
|
||||
+ u32 val = (u32)vs[i].w.val;
|
||||
+
|
||||
+ hb = queue_lock(q);
|
||||
+ ret = get_futex_value_locked(&uval, uaddr);
|
||||
+
|
||||
+ if (!ret && uval == val) {
|
||||
+ /*
|
||||
+ * The bucket lock can't be held while dealing with the
|
||||
+ * next futex. Queue each futex at this moment so hb can
|
||||
+ * be unlocked.
|
||||
+ */
|
||||
+ queue_me(q, hb);
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ queue_unlock(hb);
|
||||
+ __set_current_state(TASK_RUNNING);
|
||||
+
|
||||
+ /*
|
||||
+ * Even if something went wrong, if we find out that a futex
|
||||
+ * was woken, we don't return error and return this index to
|
||||
+ * userspace
|
||||
+ */
|
||||
+ *woken = unqueue_multiple(vs, i);
|
||||
+ if (*woken >= 0)
|
||||
+ return 1;
|
||||
+
|
||||
+ if (ret) {
|
||||
+ /*
|
||||
+ * If we need to handle a page fault, we need to do so
|
||||
+ * without any lock and any enqueued futex (otherwise
|
||||
+ * we could lose some wakeup). So we do it here, after
|
||||
+ * undoing all the work done so far. In success, we
|
||||
+ * retry all the work.
|
||||
+ */
|
||||
+ if (get_user(uval, uaddr))
|
||||
+ return -EFAULT;
|
||||
+
|
||||
+ retry = true;
|
||||
+ goto retry;
|
||||
+ }
|
||||
+
|
||||
+ if (uval != val)
|
||||
+ return -EWOULDBLOCK;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * futex_sleep_multiple - Check sleeping conditions and sleep
|
||||
+ * @vs: List of futexes to wait for
|
||||
+ * @count: Length of vs
|
||||
+ * @to: Timeout
|
||||
+ *
|
||||
+ * Sleep if and only if the timeout hasn't expired and no futex on the list has
|
||||
+ * been woken up.
|
||||
+ */
|
||||
+static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
|
||||
+ struct hrtimer_sleeper *to)
|
||||
+{
|
||||
+ if (to && !to->task)
|
||||
+ return;
|
||||
+
|
||||
+ for (; count; count--, vs++) {
|
||||
+ if (!READ_ONCE(vs->q.lock_ptr))
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ freezable_schedule();
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * futex_wait_multiple - Prepare to wait on and enqueue several futexes
|
||||
+ * @vs: The list of futexes to wait on
|
||||
+ * @count: The number of objects
|
||||
+ * @to: Timeout before giving up and returning to userspace
|
||||
+ *
|
||||
+ * Entry point for the futex_waitv() syscall, this function
|
||||
+ * sleeps on a group of futexes and returns on the first futex that is
|
||||
+ * woken, or after the timeout has elapsed.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * - >=0 - Hint to the futex that was awoken
|
||||
+ * - <0 - On error
|
||||
+ */
|
||||
+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
|
||||
+ struct hrtimer_sleeper *to)
|
||||
+{
|
||||
+ int ret, hint = 0;
|
||||
+
|
||||
+ if (to)
|
||||
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
|
||||
+
|
||||
+ while (1) {
|
||||
+ ret = futex_wait_multiple_setup(vs, count, &hint);
|
||||
+ if (ret) {
|
||||
+ if (ret > 0) {
|
||||
+ /* A futex was woken during setup */
|
||||
+ ret = hint;
|
||||
+ }
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ futex_sleep_multiple(vs, count, to);
|
||||
+
|
||||
+ __set_current_state(TASK_RUNNING);
|
||||
+
|
||||
+ ret = unqueue_multiple(vs, count);
|
||||
+ if (ret >= 0)
|
||||
+ return ret;
|
||||
+
|
||||
+ if (to && !to->task)
|
||||
+ return -ETIMEDOUT;
|
||||
+ else if (signal_pending(current))
|
||||
+ return -ERESTARTSYS;
|
||||
+ /*
|
||||
+ * The final case is a spurious wakeup, for
|
||||
+ * which just retry.
|
||||
+ */
|
||||
+ }
|
||||
+}
|
||||
+/* Mask of available flags for each futex in futex_waitv list */
|
||||
+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
|
||||
+
|
||||
+/**
|
||||
+ * sys_futex_waitv - Wait on a list of futexes
|
||||
+ * @waiters: List of futexes to wait on
|
||||
+ * @nr_futexes: Length of futexv
|
||||
+ * @flags: Reserved for future use; must be 0
|
||||
+ * @timeout: Optional absolute timeout.
|
||||
+ * @clockid: Clock to be used for the timeout, realtime or monotonic.
|
||||
+ *
|
||||
+ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
|
||||
+ * if a futex_wake() is performed at any uaddr. The syscall returns immediately
|
||||
+ * if any waiter has *uaddr != val. *timeout is an optional timeout value for
|
||||
+ * the operation. Each waiter has individual flags. The `flags` argument for
|
||||
+ * the syscall is currently unused and must be 0; it is reserved for future
|
||||
+ * extensions. Flags for private futexes, sizes, etc. should be used on the
|
||||
+ * individual flags of each waiter.
|
||||
+ *
|
||||
+ * Returns the array index of one of the woken futexes. No further information
|
||||
+ * is provided: any number of other futexes may also have been woken by the
|
||||
+ * same event, and if more than one futex was woken, the returned index may
|
||||
+ * refer to any one of them. (It is not necessarily the futex with the
|
||||
+ * smallest index, nor the one most recently woken, nor...)
|
||||
+ */
|
||||
+
|
||||
+SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
|
||||
+ unsigned int, nr_futexes, unsigned int, flags,
|
||||
+ struct __kernel_timespec __user *, timeout, clockid_t, clockid)
|
||||
+{
|
||||
+ struct hrtimer_sleeper to;
|
||||
+ struct futex_vector *futexv;
|
||||
+ struct timespec64 ts;
|
||||
+ ktime_t time;
|
||||
+ int ret;
|
||||
+
|
||||
+ /* This syscall supports no flags for now */
|
||||
+ if (flags)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (timeout) {
|
||||
+ int flag_clkid = 0, flag_init = 0;
|
||||
+
|
||||
+ if (clockid == CLOCK_REALTIME) {
|
||||
+ flag_clkid = FLAGS_CLOCKRT;
|
||||
+ flag_init = FUTEX_CLOCK_REALTIME;
|
||||
+ }
|
||||
+
|
||||
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (get_timespec64(&ts, timeout))
|
||||
+ return -EFAULT;
|
||||
+
|
||||
+ /*
|
||||
+ * Since there's no opcode for futex_waitv, use
|
||||
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
|
||||
+ */
|
||||
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
+ futex_setup_timer(&time, &to, flag_clkid, 0);
|
||||
+ }
|
||||
+
|
||||
+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
|
||||
+ if (!futexv)
|
||||
+ return -ENOMEM;
|
||||
+
|
||||
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes);
|
||||
+ if (!ret)
|
||||
+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
|
||||
+
|
||||
+ if (timeout) {
|
||||
+ hrtimer_cancel(&to.timer);
|
||||
+ destroy_hrtimer_on_stack(&to.timer);
|
||||
+ }
|
||||
+
|
||||
+ kfree(futexv);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
#ifdef CONFIG_COMPAT_32BIT_TIME
|
||||
SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
|
||||
const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
|
||||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
|
||||
index 0ea8128468c3..0979fac9414d 100644
|
||||
--- a/kernel/sys_ni.c
|
||||
+++ b/kernel/sys_ni.c
|
||||
@@ -150,6 +150,7 @@ COND_SYSCALL(set_robust_list);
|
||||
COND_SYSCALL_COMPAT(set_robust_list);
|
||||
COND_SYSCALL(get_robust_list);
|
||||
COND_SYSCALL_COMPAT(get_robust_list);
|
||||
+COND_SYSCALL(futex_waitv);
|
||||
|
||||
/* kernel/hrtimer.c */
|
||||
|
||||
--
|
||||
2.33.1
|
||||
|
||||
From 4e40f3886e134f33c50ca79bc8b323cea784bd78 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
|
||||
Date: Thu, 23 Sep 2021 14:11:06 -0300
|
||||
Subject: [PATCH 2/2] futex,x86: Wire up sys_futex_waitv()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Wire up syscall entry point for x86 arch, for both i386 and x86_64.
|
||||
|
||||
Signed-off-by: André Almeida <andrealmeid@collabora.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Link: https://lore.kernel.org/r/20210923171111.300673-18-andrealmeid@collabora.com
|
||||
---
|
||||
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
|
||||
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
|
||||
2 files changed, 2 insertions(+)
|
||||
|
||||
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
index 4bbc267fb36b..b2b9b9df1355 100644
|
||||
--- a/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
@@ -452,3 +452,4 @@
|
||||
445 i386 landlock_add_rule sys_landlock_add_rule
|
||||
446 i386 landlock_restrict_self sys_landlock_restrict_self
|
||||
447 i386 memfd_secret sys_memfd_secret
|
||||
+449 i386 futex_waitv sys_futex_waitv
|
||||
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
index ce18119ea0d0..bfd4e8f5be34 100644
|
||||
--- a/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
@@ -369,6 +369,7 @@
|
||||
445 common landlock_add_rule sys_landlock_add_rule
|
||||
446 common landlock_restrict_self sys_landlock_restrict_self
|
||||
447 common memfd_secret sys_memfd_secret
|
||||
+449 common futex_waitv sys_futex_waitv
|
||||
|
||||
#
|
||||
# Due to a historical design error, certain syscalls are numbered differently
|
||||
--
|
||||
2.33.1
|
||||
|
536
linux-tkg-patches/5.15/0007-v5.15-futex_waitv.patch
Normal file
536
linux-tkg-patches/5.15/0007-v5.15-futex_waitv.patch
Normal file
@@ -0,0 +1,536 @@
|
||||
From 4901e29e3c0237c52eadd2c82deb9bd6e7add5ac Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
|
||||
Date: Thu, 23 Sep 2021 14:11:05 -0300
|
||||
Subject: [PATCH 1/2] futex: Implement sys_futex_waitv()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Add support to wait on multiple futexes. This is the interface
|
||||
implemented by this syscall:
|
||||
|
||||
futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes,
|
||||
unsigned int flags, struct timespec *timeout, clockid_t clockid)
|
||||
|
||||
struct futex_waitv {
|
||||
__u64 val;
|
||||
__u64 uaddr;
|
||||
__u32 flags;
|
||||
__u32 __reserved;
|
||||
};
|
||||
|
||||
Given an array of struct futex_waitv, wait on each uaddr. The thread
|
||||
wakes if a futex_wake() is performed at any uaddr. The syscall returns
|
||||
immediately if any waiter has *uaddr != val. *timeout is an optional
|
||||
absolute timeout value for the operation. This syscall supports only
|
||||
64bit sized timeout structs. The flags argument of the syscall should be
|
||||
empty, but it can be used for future extensions. Flags for shared
|
||||
futexes, sizes, etc. should be used on the individual flags of each
|
||||
waiter.
|
||||
|
||||
__reserved is used for explicit padding and should be 0, but it might be
|
||||
used for future extensions. If the userspace uses 32-bit pointers, it
|
||||
should make sure to explicitly cast it when assigning to waitv::uaddr.
|
||||
|
||||
Returns the array index of one of the woken futexes. There’s no given
|
||||
information of how many were woken, or any particular attribute of it
|
||||
(if it’s the first woken, if it is of the smaller index...).
|
||||
|
||||
Signed-off-by: André Almeida <andrealmeid@collabora.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com
|
||||
---
|
||||
include/linux/syscalls.h | 6 +
|
||||
include/uapi/asm-generic/unistd.h | 5 +-
|
||||
include/uapi/linux/futex.h | 26 +++
|
||||
kernel/futex.c | 334 ++++++++++++++++++++++++++++++
|
||||
kernel/sys_ni.c | 1 +
|
||||
5 files changed, 371 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
|
||||
index 050511e8f1f8..8390482cf082 100644
|
||||
--- a/include/linux/syscalls.h
|
||||
+++ b/include/linux/syscalls.h
|
||||
@@ -58,6 +58,7 @@ struct mq_attr;
|
||||
struct compat_stat;
|
||||
struct old_timeval32;
|
||||
struct robust_list_head;
|
||||
+struct futex_waitv;
|
||||
struct getcpu_cache;
|
||||
struct old_linux_dirent;
|
||||
struct perf_event_attr;
|
||||
@@ -623,6 +624,11 @@ asmlinkage long sys_get_robust_list(int pid,
|
||||
asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
|
||||
size_t len);
|
||||
|
||||
+asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
|
||||
+ unsigned int nr_futexes, unsigned int flags,
|
||||
+ struct __kernel_timespec __user *timeout, clockid_t clockid);
|
||||
+
|
||||
+
|
||||
/* kernel/hrtimer.c */
|
||||
asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
|
||||
struct __kernel_timespec __user *rmtp);
|
||||
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
|
||||
index d2a942086fcb..3f55ac23cea9 100644
|
||||
--- a/include/uapi/asm-generic/unistd.h
|
||||
+++ b/include/uapi/asm-generic/unistd.h
|
||||
@@ -880,8 +880,11 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
|
||||
#define __NR_process_mrelease 448
|
||||
__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
|
||||
|
||||
+#define __NR_futex_waitv 449
|
||||
+__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
|
||||
+
|
||||
#undef __NR_syscalls
|
||||
-#define __NR_syscalls 449
|
||||
+#define __NR_syscalls 450
|
||||
|
||||
/*
|
||||
* 32 bit systems traditionally used different
|
||||
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
|
||||
index a89eb0accd5e..1666f5e4b837 100644
|
||||
--- a/include/uapi/linux/futex.h
|
||||
+++ b/include/uapi/linux/futex.h
|
||||
@@ -41,6 +41,32 @@
|
||||
#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
|
||||
FUTEX_PRIVATE_FLAG)
|
||||
|
||||
+ /*
|
||||
+ * Flags to specify the bit length of the futex word for futex2 syscalls.
|
||||
+ * Currently, only 32 is supported.
|
||||
+ */
|
||||
+#define FUTEX_32 2
|
||||
+
|
||||
+/*
|
||||
+ * Max number of elements in a futex_waitv array
|
||||
+ */
|
||||
+#define FUTEX_WAITV_MAX 128
|
||||
+
|
||||
+/**
|
||||
+ * struct futex_waitv - A waiter for vectorized wait
|
||||
+ * @val: Expected value at uaddr
|
||||
+ * @uaddr: User address to wait on
|
||||
+ * @flags: Flags for this waiter
|
||||
+ * @__reserved: Reserved member to preserve data alignment. Should be 0.
|
||||
+ */
|
||||
+struct futex_waitv {
|
||||
+ __u64 val;
|
||||
+ __u64 uaddr;
|
||||
+ __u32 flags;
|
||||
+ __u32 __reserved;
|
||||
+};
|
||||
+
|
||||
+
|
||||
/*
|
||||
* Support for robust futexes: the kernel cleans up held futexes at
|
||||
* thread exit time.
|
||||
diff --git a/kernel/futex.c b/kernel/futex.c
|
||||
index 408cad5e8968..d7dc0bd9379c 100644
|
||||
--- a/kernel/futex.c
|
||||
+++ b/kernel/futex.c
|
||||
@@ -285,6 +285,18 @@ static const struct futex_q futex_q_init = {
|
||||
.requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
|
||||
};
|
||||
|
||||
+/**
|
||||
+ * struct futex_vector - Auxiliary struct for futex_waitv()
|
||||
+ * @w: Userspace provided data
|
||||
+ * @q: Kernel side data
|
||||
+ *
|
||||
+ * Struct used to build an array with all data needed for futex_waitv()
|
||||
+ */
|
||||
+struct futex_vector {
|
||||
+ struct futex_waitv w;
|
||||
+ struct futex_q q;
|
||||
+};
|
||||
+
|
||||
/*
|
||||
* Hash buckets are shared by all the futex_keys that hash to the same
|
||||
* location. Each key may have multiple futex_q structures, one for each task
|
||||
@@ -3962,6 +3974,328 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
|
||||
}
|
||||
#endif /* CONFIG_COMPAT */
|
||||
|
||||
+/* Mask of available flags for each futex in futex_waitv list */
|
||||
+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
|
||||
+
|
||||
+/**
|
||||
+ * futex_parse_waitv - Parse a waitv array from userspace
|
||||
+ * @futexv: Kernel side list of waiters to be filled
|
||||
+ * @uwaitv: Userspace list to be parsed
|
||||
+ * @nr_futexes: Length of futexv
|
||||
+ *
|
||||
+ * Return: Error code on failure, 0 on success
|
||||
+ */
|
||||
+static int futex_parse_waitv(struct futex_vector *futexv,
|
||||
+ struct futex_waitv __user *uwaitv,
|
||||
+ unsigned int nr_futexes)
|
||||
+{
|
||||
+ struct futex_waitv aux;
|
||||
+ unsigned int i;
|
||||
+
|
||||
+ for (i = 0; i < nr_futexes; i++) {
|
||||
+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
|
||||
+ return -EFAULT;
|
||||
+
|
||||
+ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (!(aux.flags & FUTEX_32))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ futexv[i].w.flags = aux.flags;
|
||||
+ futexv[i].w.val = aux.val;
|
||||
+ futexv[i].w.uaddr = aux.uaddr;
|
||||
+ futexv[i].q = futex_q_init;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * unqueue_multiple - Remove various futexes from their hash bucket
|
||||
+ * @v: The list of futexes to unqueue
|
||||
+ * @count: Number of futexes in the list
|
||||
+ *
|
||||
+ * Helper to unqueue a list of futexes. This can't fail.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * - >=0 - Index of the last futex that was awoken;
|
||||
+ * - -1 - No futex was awoken
|
||||
+ */
|
||||
+static int unqueue_multiple(struct futex_vector *v, int count)
|
||||
+{
|
||||
+ int ret = -1, i;
|
||||
+
|
||||
+ for (i = 0; i < count; i++) {
|
||||
+ if (!unqueue_me(&v[i].q))
|
||||
+ ret = i;
|
||||
+ }
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
|
||||
+ * @vs: The futex list to wait on
|
||||
+ * @count: The size of the list
|
||||
+ * @woken: Index of the last woken futex, if any. Used to notify the
|
||||
+ * caller that it can return this index to userspace (return parameter)
|
||||
+ *
|
||||
+ * Prepare multiple futexes in a single step and enqueue them. This may fail if
|
||||
+ * the futex list is invalid or if any futex was already awoken. On success the
|
||||
+ * task is ready to interruptible sleep.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * - 1 - One of the futexes was woken by another thread
|
||||
+ * - 0 - Success
|
||||
+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
|
||||
+ */
|
||||
+static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
|
||||
+{
|
||||
+ struct futex_hash_bucket *hb;
|
||||
+ bool retry = false;
|
||||
+ int ret, i;
|
||||
+ u32 uval;
|
||||
+
|
||||
+ /*
|
||||
+ * Enqueuing multiple futexes is tricky, because we need to enqueue
|
||||
+ * each futex on the list before dealing with the next one to avoid
|
||||
+ * deadlocking on the hash bucket. But, before enqueuing, we need to
|
||||
+ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
|
||||
+ * lose any wake events, which cannot be done before the get_futex_key
|
||||
+ * of the next key, because it calls get_user_pages, which can sleep.
|
||||
+ * Thus, we fetch the list of futexes keys in two steps, by first
|
||||
+ * pinning all the memory keys in the futex key, and only then we read
|
||||
+ * each key and queue the corresponding futex.
|
||||
+ *
|
||||
+ * Private futexes don't need to recalculate the hash on retry, so skip
|
||||
+ * get_futex_key() when retrying.
|
||||
+ */
|
||||
+retry:
|
||||
+ for (i = 0; i < count; i++) {
|
||||
+ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
|
||||
+ continue;
|
||||
+
|
||||
+ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
|
||||
+ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
|
||||
+ &vs[i].q.key, FUTEX_READ);
|
||||
+
|
||||
+ if (unlikely(ret))
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ set_current_state(TASK_INTERRUPTIBLE);
|
||||
+
|
||||
+ for (i = 0; i < count; i++) {
|
||||
+ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
|
||||
+ struct futex_q *q = &vs[i].q;
|
||||
+ u32 val = (u32)vs[i].w.val;
|
||||
+
|
||||
+ hb = queue_lock(q);
|
||||
+ ret = get_futex_value_locked(&uval, uaddr);
|
||||
+
|
||||
+ if (!ret && uval == val) {
|
||||
+ /*
|
||||
+ * The bucket lock can't be held while dealing with the
|
||||
+ * next futex. Queue each futex at this moment so hb can
|
||||
+ * be unlocked.
|
||||
+ */
|
||||
+ queue_me(q, hb);
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ queue_unlock(hb);
|
||||
+ __set_current_state(TASK_RUNNING);
|
||||
+
|
||||
+ /*
|
||||
+ * Even if something went wrong, if we find out that a futex
|
||||
+ * was woken, we don't return error and return this index to
|
||||
+ * userspace
|
||||
+ */
|
||||
+ *woken = unqueue_multiple(vs, i);
|
||||
+ if (*woken >= 0)
|
||||
+ return 1;
|
||||
+
|
||||
+ if (ret) {
|
||||
+ /*
|
||||
+ * If we need to handle a page fault, we need to do so
|
||||
+ * without any lock and any enqueued futex (otherwise
|
||||
+ * we could lose some wakeup). So we do it here, after
|
||||
+ * undoing all the work done so far. In success, we
|
||||
+ * retry all the work.
|
||||
+ */
|
||||
+ if (get_user(uval, uaddr))
|
||||
+ return -EFAULT;
|
||||
+
|
||||
+ retry = true;
|
||||
+ goto retry;
|
||||
+ }
|
||||
+
|
||||
+ if (uval != val)
|
||||
+ return -EWOULDBLOCK;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * futex_sleep_multiple - Check sleeping conditions and sleep
|
||||
+ * @vs: List of futexes to wait for
|
||||
+ * @count: Length of vs
|
||||
+ * @to: Timeout
|
||||
+ *
|
||||
+ * Sleep if and only if the timeout hasn't expired and no futex on the list has
|
||||
+ * been woken up.
|
||||
+ */
|
||||
+static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
|
||||
+ struct hrtimer_sleeper *to)
|
||||
+{
|
||||
+ if (to && !to->task)
|
||||
+ return;
|
||||
+
|
||||
+ for (; count; count--, vs++) {
|
||||
+ if (!READ_ONCE(vs->q.lock_ptr))
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ freezable_schedule();
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * futex_wait_multiple - Prepare to wait on and enqueue several futexes
|
||||
+ * @vs: The list of futexes to wait on
|
||||
+ * @count: The number of objects
|
||||
+ * @to: Timeout before giving up and returning to userspace
|
||||
+ *
|
||||
+ * Entry point for the futex_waitv() syscall, this function
|
||||
+ * sleeps on a group of futexes and returns on the first futex that is
|
||||
+ * woken, or after the timeout has elapsed.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * - >=0 - Hint to the futex that was awoken
|
||||
+ * - <0 - On error
|
||||
+ */
|
||||
+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
|
||||
+ struct hrtimer_sleeper *to)
|
||||
+{
|
||||
+ int ret, hint = 0;
|
||||
+
|
||||
+ if (to)
|
||||
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
|
||||
+
|
||||
+ while (1) {
|
||||
+ ret = futex_wait_multiple_setup(vs, count, &hint);
|
||||
+ if (ret) {
|
||||
+ if (ret > 0) {
|
||||
+ /* A futex was woken during setup */
|
||||
+ ret = hint;
|
||||
+ }
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ futex_sleep_multiple(vs, count, to);
|
||||
+
|
||||
+ __set_current_state(TASK_RUNNING);
|
||||
+
|
||||
+ ret = unqueue_multiple(vs, count);
|
||||
+ if (ret >= 0)
|
||||
+ return ret;
|
||||
+
|
||||
+ if (to && !to->task)
|
||||
+ return -ETIMEDOUT;
|
||||
+ else if (signal_pending(current))
|
||||
+ return -ERESTARTSYS;
|
||||
+ /*
|
||||
+ * The final case is a spurious wakeup, for
|
||||
+ * which just retry.
|
||||
+ */
|
||||
+ }
|
||||
+}
|
||||
+/* Mask of available flags for each futex in futex_waitv list */
|
||||
+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
|
||||
+
|
||||
+/**
|
||||
+ * sys_futex_waitv - Wait on a list of futexes
|
||||
+ * @waiters: List of futexes to wait on
|
||||
+ * @nr_futexes: Length of futexv
|
||||
+ * @flags: Reserved for future use; must be 0
|
||||
+ * @timeout: Optional absolute timeout.
|
||||
+ * @clockid: Clock to be used for the timeout, realtime or monotonic.
|
||||
+ *
|
||||
+ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
|
||||
+ * if a futex_wake() is performed at any uaddr. The syscall returns immediately
|
||||
+ * if any waiter has *uaddr != val. *timeout is an optional timeout value for
|
||||
+ * the operation. Each waiter has individual flags. The `flags` argument for
|
||||
+ * the syscall is currently unused and must be 0; it is reserved for future
|
||||
+ * extensions. Flags for private futexes, sizes, etc. should be used on the
|
||||
+ * individual flags of each waiter.
|
||||
+ *
|
||||
+ * Returns the array index of one of the woken futexes. No further information
|
||||
+ * is provided: any number of other futexes may also have been woken by the
|
||||
+ * same event, and if more than one futex was woken, the returned index may
|
||||
+ * refer to any one of them. (It is not necessarily the futex with the
|
||||
+ * smallest index, nor the one most recently woken, nor...)
|
||||
+ */
|
||||
+
|
||||
+SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
|
||||
+ unsigned int, nr_futexes, unsigned int, flags,
|
||||
+ struct __kernel_timespec __user *, timeout, clockid_t, clockid)
|
||||
+{
|
||||
+ struct hrtimer_sleeper to;
|
||||
+ struct futex_vector *futexv;
|
||||
+ struct timespec64 ts;
|
||||
+ ktime_t time;
|
||||
+ int ret;
|
||||
+
|
||||
+ /* This syscall supports no flags for now */
|
||||
+ if (flags)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (timeout) {
|
||||
+ int flag_clkid = 0, flag_init = 0;
|
||||
+
|
||||
+ if (clockid == CLOCK_REALTIME) {
|
||||
+ flag_clkid = FLAGS_CLOCKRT;
|
||||
+ flag_init = FUTEX_CLOCK_REALTIME;
|
||||
+ }
|
||||
+
|
||||
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (get_timespec64(&ts, timeout))
|
||||
+ return -EFAULT;
|
||||
+
|
||||
+ /*
|
||||
+ * Since there's no opcode for futex_waitv, use
|
||||
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
|
||||
+ */
|
||||
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
+ futex_setup_timer(&time, &to, flag_clkid, 0);
|
||||
+ }
|
||||
+
|
||||
+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
|
||||
+ if (!futexv)
|
||||
+ return -ENOMEM;
|
||||
+
|
||||
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes);
|
||||
+ if (!ret)
|
||||
+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
|
||||
+
|
||||
+ if (timeout) {
|
||||
+ hrtimer_cancel(&to.timer);
|
||||
+ destroy_hrtimer_on_stack(&to.timer);
|
||||
+ }
|
||||
+
|
||||
+ kfree(futexv);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
#ifdef CONFIG_COMPAT_32BIT_TIME
|
||||
SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
|
||||
const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
|
||||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
|
||||
index 0ea8128468c3..0979fac9414d 100644
|
||||
--- a/kernel/sys_ni.c
|
||||
+++ b/kernel/sys_ni.c
|
||||
@@ -150,6 +150,7 @@ COND_SYSCALL(set_robust_list);
|
||||
COND_SYSCALL_COMPAT(set_robust_list);
|
||||
COND_SYSCALL(get_robust_list);
|
||||
COND_SYSCALL_COMPAT(get_robust_list);
|
||||
+COND_SYSCALL(futex_waitv);
|
||||
|
||||
/* kernel/hrtimer.c */
|
||||
|
||||
--
|
||||
2.33.1
|
||||
|
||||
From 4e40f3886e134f33c50ca79bc8b323cea784bd78 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
|
||||
Date: Thu, 23 Sep 2021 14:11:06 -0300
|
||||
Subject: [PATCH 2/2] futex,x86: Wire up sys_futex_waitv()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Wire up syscall entry point for x86 arch, for both i386 and x86_64.
|
||||
|
||||
Signed-off-by: André Almeida <andrealmeid@collabora.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Link: https://lore.kernel.org/r/20210923171111.300673-18-andrealmeid@collabora.com
|
||||
---
|
||||
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
|
||||
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
|
||||
2 files changed, 2 insertions(+)
|
||||
|
||||
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
index 4bbc267fb36b..b2b9b9df1355 100644
|
||||
--- a/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
@@ -453,3 +453,4 @@
|
||||
446 i386 landlock_restrict_self sys_landlock_restrict_self
|
||||
447 i386 memfd_secret sys_memfd_secret
|
||||
448 i386 process_mrelease sys_process_mrelease
|
||||
+449 i386 futex_waitv sys_futex_waitv
|
||||
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
index ce18119ea0d0..bfd4e8f5be34 100644
|
||||
--- a/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
@@ -370,6 +370,7 @@
|
||||
446 common landlock_restrict_self sys_landlock_restrict_self
|
||||
447 common memfd_secret sys_memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
+449 common futex_waitv sys_futex_waitv
|
||||
|
||||
#
|
||||
# Due to a historical design error, certain syscalls are numbered differently
|
||||
--
|
||||
2.33.1
|
||||
|
Reference in New Issue
Block a user