From c85b04f64ed47e3acc45a1f8aade6882659cff51 Mon Sep 17 00:00:00 2001 From: Tk-Glitch Date: Thu, 2 Sep 2021 03:40:51 +0200 Subject: [PATCH] linux514-tkg: fsync & futex2 rebased patchsets bringup The updated collabora futex2-proton patchset is reportedly broken (newer api than proton?), so I have decided to rebase our 5.13 patch against 5.14 instead. Testing showed no issue with that approach. --- PKGBUILD | 5 +- linux-tkg-config/prepare | 2 +- linux-tkg-patches/5.14/0007-v5.14-fsync.patch | 575 +++ .../5.14/0007-v5.14-futex2_interface.patch | 3545 ++++++++++++----- 4 files changed, 3178 insertions(+), 949 deletions(-) create mode 100644 linux-tkg-patches/5.14/0007-v5.14-fsync.patch diff --git a/PKGBUILD b/PKGBUILD index 9fcb1e8..7917f99 100644 --- a/PKGBUILD +++ b/PKGBUILD @@ -556,7 +556,7 @@ case $_basever in 0003-glitched-cfs-additions.patch 0005-glitched-pds.patch 0006-add-acs-overrides_iommu.patch - #0007-v5.14-fsync.patch + 0007-v5.14-fsync.patch 0007-v5.14-futex2_interface.patch 0007-v5.14-winesync.patch #0008-5.14-bcachefs.patch @@ -581,7 +581,8 @@ case $_basever in 'e5ea0bb25ee294c655ac3cc30e1eea497799826108fbfb4ef3258c676c1e8a12' 'fca63d15ca4502aebd73e76d7499b243d2c03db71ff5ab0bf5cf268b2e576320' '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' - '07cbc31f9a92f5690babad6dadc4fa57ea23dec26b2fe08c8ba9f775e02a5d60' + 'aa67e81a27d9062e463594acb91eca6dd13388f23cbe53ca56298f9dba61cc10' + 'efe5e21706fdf64559ead866c85a5d88c5c3f743d814410df3810ca61cc5b966' '034d12a73b507133da2c69a34d61efd2f6b6618549650aa26d748142d22002e1' '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' 'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911' diff --git a/linux-tkg-config/prepare b/linux-tkg-config/prepare index a8e69dc..4ce441e 100644 --- a/linux-tkg-config/prepare +++ b/linux-tkg-config/prepare @@ -92,7 +92,7 @@ _set_kernel_version() { done #Default index corresponds to latest stable kernel - _default_index="1" + _default_index="0" _prompt_from_array "${_kernel_fullver_list[@]}" _version=${_supported_kernels[$_selected_index]} fi diff --git a/linux-tkg-patches/5.14/0007-v5.14-fsync.patch b/linux-tkg-patches/5.14/0007-v5.14-fsync.patch new file mode 100644 index 0000000..047eafd --- /dev/null +++ b/linux-tkg-patches/5.14/0007-v5.14-fsync.patch @@ -0,0 +1,575 @@ +From 93ea4d3978ab84892db3d44445bc12c51fa627e3 Mon Sep 17 00:00:00 2001 +From: Gabriel Krisman Bertazi +Date: Fri, 13 Dec 2019 11:08:02 -0300 +Subject: [PATCH] futex: Implement mechanism to wait on any of several futexes +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows +a thread to wait on several futexes at the same time, and be awoken by +any of them. In a sense, it implements one of the features that was +supported by pooling on the old FUTEX_FD interface. + +The use case lies in the Wine implementation of the Windows NT interface +WaitMultipleObjects. This Windows API function allows a thread to sleep +waiting on the first of a set of event sources (mutexes, timers, signal, +console input, etc) to signal. Considering this is a primitive +synchronization operation for Windows applications, being able to quickly +signal events on the producer side, and quickly go to sleep on the +consumer side is essential for good performance of those running over Wine. + +Wine developers have an implementation that uses eventfd, but it suffers +from FD exhaustion (there is applications that go to the order of +multi-milion FDs), and higher CPU utilization than this new operation. + +The futex list is passed as an array of `struct futex_wait_block` +(pointer, value, bitset) to the kernel, which will enqueue all of them +and sleep if none was already triggered. It returns a hint of which +futex caused the wake up event to userspace, but the hint doesn't +guarantee that is the only futex triggered. Before calling the syscall +again, userspace should traverse the list, trying to re-acquire any of +the other futexes, to prevent an immediate -EWOULDBLOCK return code from +the kernel. + +This was tested using three mechanisms: + +1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and +running the unmodified tools/testing/selftests/futex and a full linux +distro on top of this kernel. + +2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a +multi-threaded, event-handling setup. + +3) By running the Wine fsync implementation and executing multi-threaded +applications, in particular modern games, on top of this implementation. + +Changes were tested for the following ABIs: x86_64, i386 and x32. +Support for x32 applications is not implemented since it would +take a major rework adding a new entry point and splitting the current +futex 64 entry point in two and we can't change the current x32 syscall +number without breaking user space compatibility. + +CC: Steven Rostedt +Cc: Richard Yao +Cc: Thomas Gleixner +Cc: Peter Zijlstra +Co-developed-by: Zebediah Figura +Signed-off-by: Zebediah Figura +Co-developed-by: Steven Noonan +Signed-off-by: Steven Noonan +Co-developed-by: Pierre-Loup A. Griffais +Signed-off-by: Pierre-Loup A. Griffais +Signed-off-by: Gabriel Krisman Bertazi +[Added compatibility code] +Co-developed-by: André Almeida +Signed-off-by: André Almeida + +Adjusted for v5.9: Removed `put_futex_key` calls. +--- + include/uapi/linux/futex.h | 20 +++ + kernel/futex.c | 348 +++++++++++++++++++++++++++++++++++++ + 2 files changed, 368 insertions(+) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e2e..580001e89c6cae 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,6 +21,7 @@ + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 + #define FUTEX_LOCK_PI2 13 ++#define FUTEX_WAIT_MULTIPLE 14 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -40,6 +41,8 @@ + FUTEX_PRIVATE_FLAG) + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) ++#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ ++ FUTEX_PRIVATE_FLAG) + + /* + * Support for robust futexes: the kernel cleans up held futexes at +@@ -150,4 +153,21 @@ struct robust_list_head { + (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ + | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) + ++/* ++ * Maximum number of multiple futexes to wait for ++ */ ++#define FUTEX_MULTIPLE_MAX_COUNT 128 ++ ++/** ++ * struct futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct futex_wait_block { ++ __u32 __user *uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ + #endif /* _UAPI_LINUX_FUTEX_H */ +diff --git a/kernel/futex.c b/kernel/futex.c +index 408cad5e89680f..c30930a955cece 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -197,6 +197,8 @@ struct futex_pi_state { + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup ++ * @uaddr: userspace address of futex ++ * @uval: expected futex's value + * + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so + * we can wake only the relevant ones (hashed queues may be shared). +@@ -219,6 +221,8 @@ struct futex_q { + struct rt_mutex_waiter *rt_waiter; + union futex_key *requeue_pi_key; + u32 bitset; ++ u32 __user *uaddr; ++ u32 uval; + } __randomize_layout; + + static const struct futex_q futex_q_init = { +@@ -2316,6 +2320,29 @@ static int unqueue_me(struct futex_q *q) + return ret; + } + ++/** ++ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket ++ * @q: The list of futexes to unqueue ++ * @count: Number of futexes in the list ++ * ++ * Helper to unqueue a list of futexes. This can't fail. ++ * ++ * Return: ++ * - >=0 - Index of the last futex that was awoken; ++ * - -1 - If no futex was awoken ++ */ ++static int unqueue_multiple(struct futex_q *q, int count) ++{ ++ int ret = -1; ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ if (!unqueue_me(&q[i])) ++ ret = i; ++ } ++ return ret; ++} ++ + /* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. +@@ -2679,6 +2706,205 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + return ret; + } + ++/** ++ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes ++ * @qs: The corresponding futex list ++ * @count: The size of the lists ++ * @flags: Futex flags (FLAGS_SHARED, etc.) ++ * @awaken: Index of the last awoken futex ++ * ++ * Prepare multiple futexes in a single step and enqueue them. This may fail if ++ * the futex list is invalid or if any futex was already awoken. On success the ++ * task is ready to interruptible sleep. ++ * ++ * Return: ++ * - 1 - One of the futexes was awaken by another thread ++ * - 0 - Success ++ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL ++ */ ++static int futex_wait_multiple_setup(struct futex_q *qs, int count, ++ unsigned int flags, int *awaken) ++{ ++ struct futex_hash_bucket *hb; ++ int ret, i; ++ u32 uval; ++ ++ /* ++ * Enqueuing multiple futexes is tricky, because we need to ++ * enqueue each futex in the list before dealing with the next ++ * one to avoid deadlocking on the hash bucket. But, before ++ * enqueuing, we need to make sure that current->state is ++ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which ++ * cannot be done before the get_futex_key of the next key, ++ * because it calls get_user_pages, which can sleep. Thus, we ++ * fetch the list of futexes keys in two steps, by first pinning ++ * all the memory keys in the futex key, and only then we read ++ * each key and queue the corresponding futex. ++ */ ++retry: ++ for (i = 0; i < count; i++) { ++ qs[i].key = FUTEX_KEY_INIT; ++ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, ++ &qs[i].key, FUTEX_READ); ++ if (unlikely(ret)) { ++ return ret; ++ } ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ struct futex_q *q = &qs[i]; ++ ++ hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, q->uaddr); ++ if (ret) { ++ /* ++ * We need to try to handle the fault, which ++ * cannot be done without sleep, so we need to ++ * undo all the work already done, to make sure ++ * we don't miss any wake ups. Therefore, clean ++ * up, handle the fault and retry from the ++ * beginning. ++ */ ++ queue_unlock(hb); ++ ++ /* ++ * Keys 0..(i-1) are implicitly put ++ * on unqueue_multiple. ++ */ ++ *awaken = unqueue_multiple(qs, i); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * On a real fault, prioritize the error even if ++ * some other futex was awoken. Userspace gave ++ * us a bad address, -EFAULT them. ++ */ ++ ret = get_user(uval, q->uaddr); ++ if (ret) ++ return ret; ++ ++ /* ++ * Even if the page fault was handled, If ++ * something was already awaken, we can safely ++ * give up and succeed to give a hint for userspace to ++ * acquire the right futex faster. ++ */ ++ if (*awaken >= 0) ++ return 1; ++ ++ goto retry; ++ } ++ ++ if (uval != q->uval) { ++ queue_unlock(hb); ++ ++ /* ++ * If something was already awaken, we can ++ * safely ignore the error and succeed. ++ */ ++ *awaken = unqueue_multiple(qs, i); ++ __set_current_state(TASK_RUNNING); ++ if (*awaken >= 0) ++ return 1; ++ ++ return -EWOULDBLOCK; ++ } ++ ++ /* ++ * The bucket lock can't be held while dealing with the ++ * next futex. Queue each futex at this moment so hb can ++ * be unlocked. ++ */ ++ queue_me(&qs[i], hb); ++ } ++ return 0; ++} ++ ++/** ++ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes ++ * @qs: The list of futexes to wait on ++ * @op: Operation code from futex's syscall ++ * @count: The number of objects ++ * @abs_time: Timeout before giving up and returning to userspace ++ * ++ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function ++ * sleeps on a group of futexes and returns on the first futex that ++ * triggered, or after the timeout has elapsed. ++ * ++ * Return: ++ * - >=0 - Hint to the futex that was awoken ++ * - <0 - On error ++ */ ++static int futex_wait_multiple(struct futex_q *qs, int op, ++ u32 count, ktime_t *abs_time) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ int ret, flags = 0, hint = 0; ++ unsigned int i; ++ ++ if (!(op & FUTEX_PRIVATE_FLAG)) ++ flags |= FLAGS_SHARED; ++ ++ if (op & FUTEX_CLOCK_REALTIME) ++ flags |= FLAGS_CLOCKRT; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, 0); ++ while (1) { ++ ret = futex_wait_multiple_setup(qs, count, flags, &hint); ++ if (ret) { ++ if (ret > 0) { ++ /* A futex was awaken during setup */ ++ ret = hint; ++ } ++ break; ++ } ++ ++ if (to) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ /* ++ * Avoid sleeping if another thread already tried to ++ * wake us. ++ */ ++ for (i = 0; i < count; i++) { ++ if (plist_node_empty(&qs[i].list)) ++ break; ++ } ++ ++ if (i == count && (!to || to->task)) ++ freezable_schedule(); ++ ++ ret = unqueue_multiple(qs, count); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (ret >= 0) ++ break; ++ if (to && !to->task) { ++ ret = -ETIMEDOUT; ++ break; ++ } else if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ /* ++ * The final case is a spurious wakeup, for ++ * which just retry. ++ */ ++ } ++ ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -3763,6 +3989,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) + case FUTEX_LOCK_PI: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: ++ case FUTEX_WAIT_MULTIPLE: + return true; + } + return false; +@@ -3782,6 +4009,44 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) + return 0; + } + ++/** ++ * futex_read_wait_block - Read an array of futex_wait_block from userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function creates and allocate an array of futex_q (we zero it to ++ * initialize the fields) and then, for each futex_wait_block element from ++ * userspace, fill a futex_q element with proper values. ++ */ ++inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct futex_wait_block fwb; ++ struct futex_wait_block __user *entry = ++ (struct futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = fwb.uaddr; ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} ++ + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + const struct __kernel_timespec __user *, utime, + u32 __user *, uaddr2, u32, val3) +@@ -3801,6 +4066,25 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + tp = &t; + } + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs; ++ ++#ifdef CONFIG_X86_X32 ++ if (unlikely(in_x32_syscall())) ++ return -ENOSYS; ++#endif ++ qs = futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); + } + +@@ -3963,6 +4247,57 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + #endif /* CONFIG_COMPAT */ + + #ifdef CONFIG_COMPAT_32BIT_TIME ++/** ++ * struct compat_futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex (compatible pointer) ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct compat_futex_wait_block { ++ compat_uptr_t uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ ++/** ++ * compat_futex_read_wait_block - Read an array of futex_wait_block from ++ * userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function does the same as futex_read_wait_block(), except that it ++ * converts the pointer to the futex from the compat version to the regular one. ++ */ ++inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, ++ u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct compat_futex_wait_block fwb; ++ struct compat_futex_wait_block __user *entry = ++ (struct compat_futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = compat_ptr(fwb.uaddr); ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} ++ + SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, + u32, val3) +@@ -3980,6 +4315,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + tp = &t; + } + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); + } + #endif /* CONFIG_COMPAT_32BIT_TIME */ +From 74d0568c6f3aa4a5f2682f6d6b5a4d59044a762e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Fri, 7 Feb 2020 23:28:02 -0300 +Subject: [PATCH] futex: Add Proton compatibility code + +--- + include/uapi/linux/futex.h | 2 +- + kernel/futex.c | 3 ++- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index 580001e89c6cae..a3e760886b8e7e 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,7 +21,7 @@ + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 + #define FUTEX_LOCK_PI2 13 +-#define FUTEX_WAIT_MULTIPLE 14 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +diff --git a/kernel/futex.c b/kernel/futex.c +index c30930a955cece..aa33e66df5c9df 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -4002,7 +4002,7 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) + return -EINVAL; + + *t = timespec64_to_ktime(*ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + *t = ktime_add_safe(ktime_get(), *t); + else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) + *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); +@@ -4255,6 +4255,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + */ + struct compat_futex_wait_block { + compat_uptr_t uaddr; ++ __u32 pad; + __u32 val; + __u32 bitset; + }; diff --git a/linux-tkg-patches/5.14/0007-v5.14-futex2_interface.patch b/linux-tkg-patches/5.14/0007-v5.14-futex2_interface.patch index 92c719b..41d0189 100644 --- a/linux-tkg-patches/5.14/0007-v5.14-futex2_interface.patch +++ b/linux-tkg-patches/5.14/0007-v5.14-futex2_interface.patch @@ -1,16 +1,20 @@ -From f5fab7a32aae5148b5b50ada2625c2f9f16e2084 Mon Sep 17 00:00:00 2001 +From ed1408eb394c22190c04ce29f859114b34891bec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Thu, 17 Jun 2021 11:50:20 -0300 -Subject: [PATCH 01/10] futex2: Implement wait and wake functions +Date: Fri, 5 Feb 2021 10:34:00 -0300 +Subject: [PATCH 01/14] futex2: Implement wait and wake functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a new set of futex syscalls known as futex2. This new interface -is aimed to expand it with new functionalities without modifying the -current complex interface. +is aimed to implement a more maintainable code, while removing obsolete +features and expanding it with new functionalities. -Implement wait and wake functions with support for 32 sized futexes: +Implements wait and wake semantics for futexes, along with the base +infrastructure for future operations. The whole wait path is designed to +be used by N waiters, thus making easier to implement vectorized wait. + +* Syscalls implemented by this patch: - futex_wait(void *uaddr, unsigned int val, unsigned int flags, struct timespec *timo) @@ -31,75 +35,131 @@ Implement wait and wake functions with support for 32 sized futexes: ** The `flag` argument The flag is used to specify the size of the futex word - (FUTEX_[8, 16, 32, 64]). It's mandatory to define one. + (FUTEX_[8, 16, 32]). It's mandatory to define one, since there's no + default size. - By default, the timeout uses a monotonic clock, but can be used as a - realtime one by using the FUTEX_REALTIME_CLOCK flag. + By default, the timeout uses a monotonic clock, but can be used as a realtime + one by using the FUTEX_REALTIME_CLOCK flag. - By default, futexes are of the private type, that means that this user - address will be accessed by threads that shares the same memory region. - This allows for some internal optimizations, so they are faster. - However, if the address needs to be shared with different processes - (like using `mmap()` or `shm()`), they need to be defined as shared and - the flag FUTEX_SHARED_FLAG is used to set that. + By default, futexes are of the private type, that means that this user address + will be accessed by threads that shares the same memory region. This allows for + some internal optimizations, so they are faster. However, if the address needs + to be shared with different processes (like using `mmap()` or `shm()`), they + need to be defined as shared and the flag FUTEX_SHARED_FLAG is used to set that. - By default, the operation has no NUMA-awareness, meaning that the user - can't choose the memory node where the kernel side futex data will be - stored. The user can choose the node where it wants to operate by - setting the FUTEX_NUMA_FLAG and using the following structure (where X - can be 8, 16, or 32, 64): + By default, the operation has no NUMA-awareness, meaning that the user can't + choose the memory node where the kernel side futex data will be stored. The + user can choose the node where it wants to operate by setting the + FUTEX_NUMA_FLAG and using the following structure (where X can be 8, 16, or + 32): struct futexX_numa { __uX value; __sX hint; }; - This structure should be passed at the `void *uaddr` of futex - functions. The address of the structure will be used to be waited/waken - on, and the `value` will be compared to `val` as usual. The `hint` - member is used to defined which node the futex will use. When waiting, - the futex will be registered on a kernel-side table stored on that - node; when waking, the futex will be searched for on that given table. - That means that there's no redundancy between tables, and the wrong - `hint` value will led to undesired behavior. Userspace is responsible - for dealing with node migrations issues that may occur. `hint` can - range from [0, MAX_NUMA_NODES], for specifying a node, or -1, to use - the same node the current process is using. + This structure should be passed at the `void *uaddr` of futex functions. The + address of the structure will be used to be waited/waken on, and the + `value` will be compared to `val` as usual. The `hint` member is used to + defined which node the futex will use. When waiting, the futex will be + registered on a kernel-side table stored on that node; when waking, the futex + will be searched for on that given table. That means that there's no redundancy + between tables, and the wrong `hint` value will led to undesired behavior. + Userspace is responsible for dealing with node migrations issues that may + occur. `hint` can range from [0, MAX_NUMA_NODES], for specifying a node, or + -1, to use the same node the current process is using. - When not using FUTEX_NUMA_FLAG on a NUMA system, the futex will be - stored on a global table on some node, defined at compilation time. + When not using FUTEX_NUMA_FLAG on a NUMA system, the futex will be stored on a + global table on some node, defined at compilation time. ** The `timo` argument -As per the Y2038 work done in the kernel, new interfaces shouldn't add -timeout options known to be buggy. Given that, `timo` should be a 64bit -timeout at all platforms, using an absolute timeout value. +As per the Y2038 work done in the kernel, new interfaces shouldn't add timeout +options known to be buggy. Given that, `timo` should be a 64bit timeout at +all platforms, using an absolute timeout value. -Signed-off-by: André Almeida +Signed-off-by: André Almeida + +Rebased-by: Joshua Ashton --- - arch/x86/entry/syscalls/syscall_32.tbl | 2 + - arch/x86/entry/syscalls/syscall_64.tbl | 2 + - include/linux/futex.h | 22 +++++++ - include/linux/syscalls.h | 6 ++ - include/uapi/asm-generic/unistd.h | 7 ++- - include/uapi/linux/futex.h | 4 +- - init/Kconfig | 7 +++ - kernel/Makefile | 1 + - kernel/futex.c | 23 +------- - kernel/futex2.c | 82 ++++++++++++++++++++++++++ - kernel/sys_ni.c | 5 ++ - 11 files changed, 139 insertions(+), 22 deletions(-) + MAINTAINERS | 2 +- + arch/arm/tools/syscall.tbl | 2 + + arch/arm64/include/asm/unistd.h | 2 +- + arch/arm64/include/asm/unistd32.h | 4 + + arch/x86/entry/syscalls/syscall_32.tbl | 2 + + arch/x86/entry/syscalls/syscall_64.tbl | 2 + + include/linux/syscalls.h | 7 + + include/uapi/asm-generic/unistd.h | 8 +- + include/uapi/linux/futex.h | 5 + + init/Kconfig | 7 + + kernel/Makefile | 1 + + kernel/futex2.c | 619 ++++++++++++++++++ + kernel/sys_ni.c | 4 + + tools/include/uapi/asm-generic/unistd.h | 8 +- + .../arch/x86/entry/syscalls/syscall_64.tbl | 2 + + 15 files changed, 671 insertions(+), 4 deletions(-) create mode 100644 kernel/futex2.c +diff --git a/MAINTAINERS b/MAINTAINERS +index 673cadd5107a..b4b81b9a6e37 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -7521,7 +7521,7 @@ F: Documentation/locking/*futex* + F: include/asm-generic/futex.h + F: include/linux/futex.h + F: include/uapi/linux/futex.h +-F: kernel/futex.c ++F: kernel/futex* + F: tools/perf/bench/futex* + F: tools/testing/selftests/futex/ + +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index 28e03b5fec00..b60a8bdab623 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -460,3 +460,5 @@ + 444 common landlock_create_ruleset sys_landlock_create_ruleset + 445 common landlock_add_rule sys_landlock_add_rule + 446 common landlock_restrict_self sys_landlock_restrict_self ++447 common futex_wait sys_futex_wait ++448 common futex_wake sys_futex_wake +diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h +index 727bfc3be99b..3cb206aea3db 100644 +--- a/arch/arm64/include/asm/unistd.h ++++ b/arch/arm64/include/asm/unistd.h +@@ -38,7 +38,7 @@ + #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) + #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) + +-#define __NR_compat_syscalls 447 ++#define __NR_compat_syscalls 449 + #endif + + #define __ARCH_WANT_SYS_CLONE +diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h +index 5dab69d2c22b..1749cc108449 100644 +--- a/arch/arm64/include/asm/unistd32.h ++++ b/arch/arm64/include/asm/unistd32.h +@@ -900,6 +900,10 @@ __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) + __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) + #define __NR_landlock_restrict_self 446 + __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) ++#define __NR_futex_wait 447 ++__SYSCALL(__NR_futex_wait, sys_futex_wait) ++#define __NR_futex_wake 448 ++__SYSCALL(__NR_futex_wake, sys_futex_wake) + + /* + * Please add new compat syscalls above this comment and update diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index 4bbc267fb36b..e3b827a9c094 100644 +index 4bbc267fb36b..f75de79fa93d 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -451,3 +451,5 @@ 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret -+448 i386 futex_wait sys_futex_wait compat_sys_futex_wait ++448 i386 futex_wait sys_futex_wait +449 i386 futex_wake sys_futex_wake diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ce18119ea0d0..63b447255df2 100644 @@ -114,49 +174,17 @@ index ce18119ea0d0..63b447255df2 100644 # # Due to a historical design error, certain syscalls are numbered differently -diff --git a/include/linux/futex.h b/include/linux/futex.h -index b70df27d7e85..abcc001f992a 100644 ---- a/include/linux/futex.h -+++ b/include/linux/futex.h -@@ -77,6 +77,28 @@ void futex_exec_release(struct task_struct *tsk); - - long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - u32 __user *uaddr2, u32 val2, u32 val3); -+ -+int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); -+ -+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, -+ u32 bitset); -+ -+/* -+ * Futex flags used to encode options to functions and preserve them across -+ * restarts. -+ */ -+#ifdef CONFIG_MMU -+# define FLAGS_SHARED 0x01 -+#else -+/* -+ * NOMMU does not have per process address space. Let the compiler optimize -+ * code away. -+ */ -+# define FLAGS_SHARED 0x00 -+#endif -+#define FLAGS_CLOCKRT 0x02 -+#define FLAGS_HAS_TIMEOUT 0x04 -+ - #else - static inline void futex_init_task(struct task_struct *tsk) { } - static inline void futex_exit_recursive(struct task_struct *tsk) { } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h -index 050511e8f1f8..b9c2874410d0 100644 +index 050511e8f1f8..0f9b64cc34f7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h -@@ -623,6 +623,12 @@ asmlinkage long sys_get_robust_list(int pid, +@@ -623,6 +623,13 @@ asmlinkage long sys_get_robust_list(int pid, asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); +/* kernel/futex2.c */ -+asmlinkage long sys_futex_wait(void __user *uaddr, u64 val, unsigned int flags, ++asmlinkage long sys_futex_wait(void __user *uaddr, unsigned int val, ++ unsigned int flags, + struct __kernel_timespec __user *timo); +asmlinkage long sys_futex_wake(void __user *uaddr, unsigned int nr_wake, + unsigned int flags); @@ -165,16 +193,17 @@ index 050511e8f1f8..b9c2874410d0 100644 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index d2a942086fcb..df9fe2e23ee0 100644 +index 6de5a7fc066b..2a62ecca2b00 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h -@@ -872,8 +872,13 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) +@@ -873,8 +873,14 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) __SYSCALL(__NR_memfd_secret, sys_memfd_secret) #endif -+#define __NR_futex_wait 448 -+__SC_COMP(__NR_futex_wait, sys_futex_wait, compat_sys_futex_wait) -+#define __NR_futex_wake 449 ++#define __NR_futex_wait 443 ++__SYSCALL(__NR_futex_wait, sys_futex_wait) ++ ++#define __NR_futex_wake 444 +__SYSCALL(__NR_futex_wake, sys_futex_wake) + #undef __NR_syscalls @@ -184,17 +213,18 @@ index d2a942086fcb..df9fe2e23ee0 100644 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 235e5b2facaa..44750caa261e 100644 +index a89eb0accd5e..8d30f4b6d094 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h -@@ -42,7 +42,9 @@ - FUTEX_PRIVATE_FLAG) +@@ -41,6 +41,11 @@ #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) -- + ++/* Size argument to futex2 syscall */ +#define FUTEX_32 2 -+#define FUTEX_SHARED_FLAG 8 ++ +#define FUTEX_SIZE_MASK 0x3 ++ /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -228,165 +258,1096 @@ index 4df609be42d0..1eaf2af50283 100644 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) -diff --git a/kernel/futex.c b/kernel/futex.c -index 2ecb07575055..ef7131bd8bc4 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -150,22 +150,6 @@ - static int __read_mostly futex_cmpxchg_enabled; - #endif - --/* -- * Futex flags used to encode options to functions and preserve them across -- * restarts. -- */ --#ifdef CONFIG_MMU --# define FLAGS_SHARED 0x01 --#else --/* -- * NOMMU does not have per process address space. Let the compiler optimize -- * code away. -- */ --# define FLAGS_SHARED 0x00 --#endif --#define FLAGS_CLOCKRT 0x02 --#define FLAGS_HAS_TIMEOUT 0x04 -- - /* - * Priority Inheritance state: - */ -@@ -1588,8 +1572,7 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) - /* - * Wake up waiters matching bitset queued on this futex (uaddr). - */ --static int --futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) -+int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) - { - struct futex_hash_bucket *hb; - struct futex_q *this, *next; -@@ -2676,8 +2659,8 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - return ret; - } - --static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, -- ktime_t *abs_time, u32 bitset) -+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, -+ ktime_t *abs_time, u32 bitset) - { - struct hrtimer_sleeper timeout, *to; - struct restart_block *restart; diff --git a/kernel/futex2.c b/kernel/futex2.c new file mode 100644 -index 000000000000..4db771db48ee +index 000000000000..ade407c1abb7 --- /dev/null +++ b/kernel/futex2.c -@@ -0,0 +1,82 @@ +@@ -0,0 +1,619 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -+ * futex2 system call interface by André Almeida ++ * futex2 system call interface by André Almeida + * + * Copyright 2021 Collabora Ltd. ++ * ++ * Based on original futex implementation by: ++ * (C) 2002 Rusty Russell, IBM ++ * (C) 2003, 2006 Ingo Molnar, Red Hat Inc. ++ * (C) 2003, 2004 Jamie Lokier ++ * (C) 2006 Thomas Gleixner, Timesys Corp. ++ * (C) 2007 Eric Dumazet ++ * (C) 2009 Darren Hart, IBM + */ + ++#include ++#include ++#include ++#include ++#include +#include ++#include + -+#include ++/** ++ * struct futex_key - Components to build unique key for a futex ++ * @pointer: Pointer to current->mm ++ * @index: Start address of the page containing futex ++ * @offset: Address offset of uaddr in a page ++ */ ++struct futex_key { ++ u64 pointer; ++ unsigned long index; ++ unsigned long offset; ++}; ++ ++/** ++ * struct futex_waiter - List entry for a waiter ++ * @uaddr: Virtual address of userspace futex ++ * @key: Information that uniquely identify a futex ++ * @list: List node struct ++ * @val: Expected value for this waiter ++ * @flags: Flags ++ * @bucket: Pointer to the bucket for this waiter ++ * @index: Index of waiter in futexv list ++ */ ++struct futex_waiter { ++ void __user *uaddr; ++ struct futex_key key; ++ struct list_head list; ++ unsigned int val; ++ unsigned int flags; ++ struct futex_bucket *bucket; ++ unsigned int index; ++}; ++ ++/** ++ * struct futex_waiter_head - List of futexes to be waited ++ * @task: Task to be awaken ++ * @hint: Was someone on this list awakened? ++ * @objects: List of futexes ++ */ ++struct futex_waiter_head { ++ struct task_struct *task; ++ bool hint; ++ struct futex_waiter objects[0]; ++}; ++ ++/** ++ * struct futex_bucket - A bucket of futex's hash table ++ * @waiters: Number of waiters in the bucket ++ * @lock: Bucket lock ++ * @list: List of waiters on this bucket ++ */ ++struct futex_bucket { ++ atomic_t waiters; ++ spinlock_t lock; ++ struct list_head list; ++}; ++ ++ ++/* Mask for futex2 flag operations */ ++#define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_CLOCK_REALTIME) ++ ++static struct futex_bucket *futex_table; ++static unsigned int futex2_hashsize; + +/* -+ * Set of flags that futex2 operates. If we got something that is not in this -+ * set, it can be a unsupported futex1 operation like BITSET or PI, so we -+ * refuse to accept ++ * Reflects a new waiter being added to the waitqueue. + */ -+#define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG | FUTEX_CLOCK_REALTIME) -+ -+static long ksys_futex_wait(void __user *uaddr, u64 val, unsigned int flags, -+ struct __kernel_timespec __user *timo) ++static inline void bucket_inc_waiters(struct futex_bucket *bucket) +{ -+ unsigned int size = flags & FUTEX_SIZE_MASK, futex_flags = 0; -+ ktime_t *kt = NULL, time; ++#ifdef CONFIG_SMP ++ atomic_inc(&bucket->waiters); ++ /* ++ * Issue a barrier after adding so futex_wake() will see that the ++ * value had increased ++ */ ++ smp_mb__after_atomic(); ++#endif ++} ++ ++/* ++ * Reflects a waiter being removed from the waitqueue by wakeup ++ * paths. ++ */ ++static inline void bucket_dec_waiters(struct futex_bucket *bucket) ++{ ++#ifdef CONFIG_SMP ++ atomic_dec(&bucket->waiters); ++#endif ++} ++ ++/* ++ * Get the number of waiters in a bucket ++ */ ++static inline int bucket_get_waiters(struct futex_bucket *bucket) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * Issue a barrier before reading so we get an updated value from ++ * futex_wait() ++ */ ++ smp_mb(); ++ return atomic_read(&bucket->waiters); ++#else ++ return 1; ++#endif ++} ++ ++/** ++ * futex_get_bucket - Check if the user address is valid, prepare internal ++ * data and calculate the hash ++ * @uaddr: futex user address ++ * @key: data that uniquely identifies a futex ++ * ++ * Return: address of bucket on success, error code otherwise ++ */ ++static struct futex_bucket *futex_get_bucket(void __user *uaddr, ++ struct futex_key *key) ++{ ++ uintptr_t address = (uintptr_t)uaddr; ++ u32 hash_key; ++ ++ /* Checking if uaddr is valid and accessible */ ++ if (unlikely(!IS_ALIGNED(address, sizeof(u32)))) ++ return ERR_PTR(-EINVAL); ++ if (unlikely(!access_ok(uaddr, sizeof(u32)))) ++ return ERR_PTR(-EFAULT); ++ ++ key->offset = address % PAGE_SIZE; ++ address -= key->offset; ++ key->pointer = (u64)address; ++ key->index = (unsigned long)current->mm; ++ ++ /* Generate hash key for this futex using uaddr and current->mm */ ++ hash_key = jhash2((u32 *)key, sizeof(*key) / sizeof(u32), 0); ++ ++ /* Since HASH_SIZE is 2^n, subtracting 1 makes a perfect bit mask */ ++ return &futex_table[hash_key & (futex2_hashsize - 1)]; ++} ++ ++/** ++ * futex_get_user - Get the userspace value on this address ++ * @uval: variable to store the value ++ * @uaddr: userspace address ++ * ++ * Check the comment at futex_enqueue() for more information. ++ */ ++static int futex_get_user(u32 *uval, u32 __user *uaddr) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = __get_user(*uval, uaddr); ++ pagefault_enable(); ++ ++ return ret; ++} ++ ++/** ++ * futex_setup_time - Prepare the timeout mechanism and start it. ++ * @timo: Timeout value from userspace ++ * @timeout: Pointer to hrtimer handler ++ * @flags: Flags from userspace, to decide which clockid to use ++ * ++ * Return: 0 on success, error code otherwise ++ */ ++static int futex_setup_time(struct __kernel_timespec __user *timo, ++ struct hrtimer_sleeper *timeout, ++ unsigned int flags) ++{ ++ ktime_t time; + struct timespec64 ts; ++ clockid_t clockid = (flags & FUTEX_CLOCK_REALTIME) ? ++ CLOCK_REALTIME : CLOCK_MONOTONIC; ++ ++ if (get_timespec64(&ts, timo)) ++ return -EFAULT; ++ ++ if (!timespec64_valid(&ts)) ++ return -EINVAL; ++ ++ time = timespec64_to_ktime(ts); ++ ++ hrtimer_init_sleeper(timeout, clockid, HRTIMER_MODE_ABS); ++ ++ hrtimer_set_expires(&timeout->timer, time); ++ ++ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); ++ ++ return 0; ++} ++ ++/** ++ * futex_dequeue_multiple - Remove multiple futexes from hash table ++ * @futexv: list of waiters ++ * @nr: number of futexes to be removed ++ * ++ * This function is used if (a) something went wrong while enqueuing, and we ++ * need to undo our work (then nr <= nr_futexes) or (b) we woke up, and thus ++ * need to remove every waiter, check if some was indeed woken and return. ++ * Before removing a waiter, we check if it's on the list, since we have no ++ * clue who have been waken. ++ * ++ * Return: ++ * * -1 - If no futex was woken during the removal ++ * * 0>= - At least one futex was found woken, index of the last one ++ */ ++static int futex_dequeue_multiple(struct futex_waiter_head *futexv, unsigned int nr) ++{ ++ int i, ret = -1; ++ ++ for (i = 0; i < nr; i++) { ++ spin_lock(&futexv->objects[i].bucket->lock); ++ if (!list_empty(&futexv->objects[i].list)) { ++ list_del_init(&futexv->objects[i].list); ++ bucket_dec_waiters(futexv->objects[i].bucket); ++ } else { ++ ret = i; ++ } ++ spin_unlock(&futexv->objects[i].bucket->lock); ++ } ++ ++ return ret; ++} ++ ++/** ++ * futex_enqueue - Check the value and enqueue a futex on a wait list ++ * ++ * @futexv: List of futexes ++ * @nr_futexes: Number of futexes in the list ++ * @awakened: If a futex was awakened during enqueueing, store the index here ++ * ++ * Get the value from the userspace address and compares with the expected one. ++ * ++ * Getting the value from user futex address: ++ * ++ * Since we are in a hurry, we use a spin lock and we can't sleep. ++ * Try to get the value with page fault disabled (when enable, we might ++ * sleep). ++ * ++ * If we fail, we aren't sure if the address is invalid or is just a ++ * page fault. Then, release the lock (so we can sleep) and try to get ++ * the value with page fault enabled. In order to trigger a page fault ++ * handling, we just call __get_user() again. If we sleep with enqueued ++ * futexes, we might miss a wake, so dequeue everything before sleeping. ++ * ++ * If get_user succeeds, this mean that the address is valid and we do ++ * the work again. Since we just handled the page fault, the page is ++ * likely pinned in memory and we should be luckier this time and be ++ * able to get the value. If we fail anyway, we will try again. ++ * ++ * If even with page faults enabled we get and error, this means that ++ * the address is not valid and we return from the syscall. ++ * ++ * If we got an unexpected value or need to treat a page fault and realized that ++ * a futex was awakened, we can priority this and return success. ++ * ++ * In success, enqueue the futex in the correct bucket ++ * ++ * Return: ++ * * 1 - We were awake in the process and nothing is enqueued ++ * * 0 - Everything is enqueued and we are ready to sleep ++ * * 0< - Something went wrong, nothing is enqueued, return error code ++ */ ++static int futex_enqueue(struct futex_waiter_head *futexv, unsigned int nr_futexes, ++ int *awakened) ++{ ++ int i, ret; ++ u32 uval, val; ++ u32 __user *uaddr; ++ struct futex_bucket *bucket; ++ ++retry: ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < nr_futexes; i++) { ++ uaddr = (u32 __user *)futexv->objects[i].uaddr; ++ val = (u32)futexv->objects[i].val; ++ ++ bucket = futexv->objects[i].bucket; ++ ++ bucket_inc_waiters(bucket); ++ spin_lock(&bucket->lock); ++ ++ ret = futex_get_user(&uval, uaddr); ++ ++ if (unlikely(ret)) { ++ spin_unlock(&bucket->lock); ++ ++ bucket_dec_waiters(bucket); ++ __set_current_state(TASK_RUNNING); ++ *awakened = futex_dequeue_multiple(futexv, i); ++ ++ if (*awakened >= 0) ++ return 1; ++ ++ if (__get_user(uval, uaddr)) ++ return -EFAULT; ++ ++ goto retry; ++ } ++ ++ if (uval != val) { ++ spin_unlock(&bucket->lock); ++ ++ bucket_dec_waiters(bucket); ++ __set_current_state(TASK_RUNNING); ++ *awakened = futex_dequeue_multiple(futexv, i); ++ ++ if (*awakened >= 0) ++ return 1; ++ ++ return -EAGAIN; ++ } ++ ++ list_add_tail(&futexv->objects[i].list, &bucket->list); ++ spin_unlock(&bucket->lock); ++ } ++ ++ return 0; ++} ++ ++/** ++ * __futex_wait - Enqueue the list of futexes and wait to be woken ++ * @futexv: List of futexes to wait ++ * @nr_futexes: Length of futexv ++ * @timeout: Pointer to timeout handler ++ * ++ * Return: ++ * * 0 >= - Hint of which futex woke us ++ * * 0 < - Error code ++ */ ++static int __futex_wait(struct futex_waiter_head *futexv, unsigned int nr_futexes, ++ struct hrtimer_sleeper *timeout) ++{ ++ int ret; ++ ++ while (1) { ++ int awakened = -1; ++ ++ ret = futex_enqueue(futexv, nr_futexes, &awakened); ++ ++ if (ret) { ++ if (awakened >= 0) ++ return awakened; ++ return ret; ++ } ++ ++ /* Before sleeping, check if someone was woken */ ++ if (!futexv->hint && (!timeout || timeout->task)) ++ freezable_schedule(); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * One of those things triggered this wake: ++ * ++ * * We have been removed from the bucket. futex_wake() woke ++ * us. We just need to dequeue and return 0 to userspace. ++ * ++ * However, if no futex was dequeued by a futex_wake(): ++ * ++ * * If the there's a timeout and it has expired, ++ * return -ETIMEDOUT. ++ * ++ * * If there is a signal pending, something wants to kill our ++ * thread, return -ERESTARTSYS. ++ * ++ * * If there's no signal pending, it was a spurious wake ++ * (scheduler gave us a chance to do some work, even if we ++ * don't want to). We need to remove ourselves from the ++ * bucket and add again, to prevent losing wakeups in the ++ * meantime. ++ */ ++ ++ ret = futex_dequeue_multiple(futexv, nr_futexes); ++ ++ /* Normal wake */ ++ if (ret >= 0) ++ return ret; ++ ++ if (timeout && !timeout->task) ++ return -ETIMEDOUT; ++ ++ if (signal_pending(current)) ++ return -ERESTARTSYS; ++ ++ /* Spurious wake, do everything again */ ++ } ++} ++ ++/** ++ * futex_wait - Setup the timer (if there's one) and wait on a list of futexes ++ * @futexv: List of futexes ++ * @nr_futexes: Length of futexv ++ * @timo: Timeout ++ * @flags: Timeout flags ++ * ++ * Return: ++ * * 0 >= - Hint of which futex woke us ++ * * 0 < - Error code ++ */ ++static int futex_set_timer_and_wait(struct futex_waiter_head *futexv, ++ unsigned int nr_futexes, ++ struct __kernel_timespec __user *timo, ++ unsigned int flags) ++{ ++ struct hrtimer_sleeper timeout; ++ int ret; ++ ++ if (timo) { ++ ret = futex_setup_time(timo, &timeout, flags); ++ if (ret) ++ return ret; ++ } ++ ++ ret = __futex_wait(futexv, nr_futexes, timo ? &timeout : NULL); ++ ++ if (timo) ++ hrtimer_cancel(&timeout.timer); ++ ++ return ret; ++} ++ ++/** ++ * sys_futex_wait - Wait on a futex address if (*uaddr) == val ++ * @uaddr: User address of futex ++ * @val: Expected value of futex ++ * @flags: Specify the size of futex and the clockid ++ * @timo: Optional absolute timeout. ++ * ++ * The user thread is put to sleep, waiting for a futex_wake() at uaddr, if the ++ * value at *uaddr is the same as val (otherwise, the syscall returns ++ * immediately with -EAGAIN). ++ * ++ * Returns 0 on success, error code otherwise. ++ */ ++SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, ++ unsigned int, flags, struct __kernel_timespec __user *, timo) ++{ ++ unsigned int size = flags & FUTEX_SIZE_MASK; ++ struct futex_waiter *waiter; ++ struct futex_waiter_head *futexv; ++ ++ /* Wrapper for a futexv_head of one element */ ++ struct { ++ struct futex_waiter_head futexv; ++ struct futex_waiter waiter; ++ } __packed wait_single; + + if (flags & ~FUTEX2_MASK) + return -EINVAL; + -+ if (flags & FUTEX_SHARED_FLAG) -+ futex_flags |= FLAGS_SHARED; -+ -+ if (flags & FUTEX_CLOCK_REALTIME) -+ futex_flags |= FLAGS_CLOCKRT; -+ + if (size != FUTEX_32) + return -EINVAL; + -+ if (timo) { -+ if (get_timespec64(&ts, timo)) -+ return -EFAULT; ++ futexv = &wait_single.futexv; ++ futexv->task = current; ++ futexv->hint = false; + -+ if (!timespec64_valid(&ts)) -+ return -EINVAL; ++ waiter = &wait_single.waiter; ++ waiter->index = 0; ++ waiter->val = val; ++ waiter->uaddr = uaddr; ++ memset(&wait_single.waiter.key, 0, sizeof(struct futex_key)); + -+ time = timespec64_to_ktime(ts); -+ kt = &time; -+ } ++ INIT_LIST_HEAD(&waiter->list); + -+ return futex_wait(uaddr, futex_flags, val, kt, FUTEX_BITSET_MATCH_ANY); ++ /* Get an unlocked hash bucket */ ++ waiter->bucket = futex_get_bucket(uaddr, &waiter->key); ++ if (IS_ERR(waiter->bucket)) ++ return PTR_ERR(waiter->bucket); ++ ++ return futex_set_timer_and_wait(futexv, 1, timo, flags); +} + -+SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, u64, val, unsigned int, flags, -+ struct __kernel_timespec __user *, timo) ++/** ++ * futex_get_parent - For a given futex in a futexv list, get a pointer to the futexv ++ * @waiter: Address of futex in the list ++ * @index: Index of futex in the list ++ * ++ * Return: A pointer to its futexv struct ++ */ ++static inline struct futex_waiter_head *futex_get_parent(uintptr_t waiter, ++ unsigned int index) +{ -+ return ksys_futex_wait(uaddr, val, flags, timo); ++ uintptr_t parent = waiter - sizeof(struct futex_waiter_head) ++ - (uintptr_t)(index * sizeof(struct futex_waiter)); ++ ++ return (struct futex_waiter_head *)parent; +} + -+#ifdef CONFIG_COMPAT -+COMPAT_SYSCALL_DEFINE4(compat_futex_wait, void __user *, uaddr, compat_u64, val, -+ unsigned int, flags, -+ struct __kernel_timespec __user *, timo) ++/** ++ * futex_mark_wake - Find the task to be wake and add it in wake queue ++ * @waiter: Waiter to be wake ++ * @bucket: Bucket to be decremented ++ * @wake_q: Wake queue to insert the task ++ */ ++static void futex_mark_wake(struct futex_waiter *waiter, ++ struct futex_bucket *bucket, ++ struct wake_q_head *wake_q) +{ -+ return ksys_futex_wait(uaddr, val, flags, timo); -+} -+#endif ++ struct task_struct *task; ++ struct futex_waiter_head *parent = futex_get_parent((uintptr_t)waiter, ++ waiter->index); + ++ lockdep_assert_held(&bucket->lock); ++ parent->hint = true; ++ task = parent->task; ++ get_task_struct(task); ++ list_del_init(&waiter->list); ++ wake_q_add_safe(wake_q, task); ++ bucket_dec_waiters(bucket); ++} ++ ++static inline bool futex_match(struct futex_key key1, struct futex_key key2) ++{ ++ return (key1.index == key2.index && ++ key1.pointer == key2.pointer && ++ key1.offset == key2.offset); ++} ++ ++/** ++ * sys_futex_wake - Wake a number of futexes waiting on an address ++ * @uaddr: Address of futex to be woken up ++ * @nr_wake: Number of futexes waiting in uaddr to be woken up ++ * @flags: Flags for size and shared ++ * ++ * Wake `nr_wake` threads waiting at uaddr. ++ * ++ * Returns the number of woken threads on success, error code otherwise. ++ */ +SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + unsigned int, flags) +{ -+ unsigned int size = flags & FUTEX_SIZE_MASK, futex_flags = 0; ++ unsigned int size = flags & FUTEX_SIZE_MASK; ++ struct futex_waiter waiter, *aux, *tmp; ++ struct futex_bucket *bucket; ++ DEFINE_WAKE_Q(wake_q); ++ int ret = 0; + + if (flags & ~FUTEX2_MASK) + return -EINVAL; + -+ if (flags & FUTEX_SHARED_FLAG) -+ futex_flags |= FLAGS_SHARED; -+ + if (size != FUTEX_32) + return -EINVAL; + -+ return futex_wake(uaddr, futex_flags, nr_wake, FUTEX_BITSET_MATCH_ANY); ++ bucket = futex_get_bucket(uaddr, &waiter.key); ++ if (IS_ERR(bucket)) ++ return PTR_ERR(bucket); ++ ++ if (!bucket_get_waiters(bucket) || !nr_wake) ++ return 0; ++ ++ spin_lock(&bucket->lock); ++ list_for_each_entry_safe(aux, tmp, &bucket->list, list) { ++ if (futex_match(waiter.key, aux->key)) { ++ futex_mark_wake(aux, bucket, &wake_q); ++ if (++ret >= nr_wake) ++ break; ++ } ++ } ++ spin_unlock(&bucket->lock); ++ ++ wake_up_q(&wake_q); ++ ++ return ret; +} ++ ++static int __init futex2_init(void) ++{ ++ int i; ++ unsigned int futex_shift; ++ ++#if CONFIG_BASE_SMALL ++ futex2_hashsize = 16; ++#else ++ futex2_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); ++#endif ++ ++ futex_table = alloc_large_system_hash("futex2", sizeof(struct futex_bucket), ++ futex2_hashsize, 0, ++ futex2_hashsize < 256 ? HASH_SMALL : 0, ++ &futex_shift, NULL, ++ futex2_hashsize, futex2_hashsize); ++ futex2_hashsize = 1UL << futex_shift; ++ ++ BUG_ON(!is_power_of_2(futex2_hashsize)); ++ ++ for (i = 0; i < futex2_hashsize; i++) { ++ INIT_LIST_HEAD(&futex_table[i].list); ++ spin_lock_init(&futex_table[i].lock); ++ atomic_set(&futex_table[i].waiters, 0); ++ } ++ ++ return 0; ++} ++core_initcall(futex2_init); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index 0ea8128468c3..dbe397eaea46 100644 +index 0ea8128468c3..9addbe373f00 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c -@@ -151,6 +151,11 @@ COND_SYSCALL_COMPAT(set_robust_list); +@@ -151,6 +151,10 @@ COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); COND_SYSCALL_COMPAT(get_robust_list); +/* kernel/futex2.c */ +COND_SYSCALL(futex_wait); -+COND_SYSCALL_COMPAT(futex_wait); +COND_SYSCALL(futex_wake); + /* kernel/hrtimer.c */ /* kernel/itimer.c */ +diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h +index 6de5a7fc066b..2a62ecca2b00 100644 +--- a/tools/include/uapi/asm-generic/unistd.h ++++ b/tools/include/uapi/asm-generic/unistd.h +@@ -873,8 +873,14 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) + __SYSCALL(__NR_memfd_secret, sys_memfd_secret) + #endif + ++#define __NR_futex_wait 443 ++__SYSCALL(__NR_futex_wait, sys_futex_wait) ++ ++#define __NR_futex_wake 444 ++__SYSCALL(__NR_futex_wake, sys_futex_wake) ++ + #undef __NR_syscalls +-#define __NR_syscalls 448 ++#define __NR_syscalls 450 + + /* + * 32 bit systems traditionally used different +diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +index ce18119ea0d0..8eb17cc08a69 100644 +--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +@@ -368,6 +368,8 @@ + 445 common landlock_add_rule sys_landlock_add_rule + 446 common landlock_restrict_self sys_landlock_restrict_self + 447 common memfd_secret sys_memfd_secret ++448 common futex_wait sys_futex_wait ++449 common futex_wake sys_futex_wake + + # + # Due to a historical design error, certain syscalls are numbered differently -- -2.33.0 +2.31.1 -From 573fd06ce56a5be61d930d63a0243f2710ee31d3 Mon Sep 17 00:00:00 2001 + +From 24d84c5a45d3a5c5f3b6f2899bfe1c97e2380964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Thu, 24 Jun 2021 10:43:51 -0300 -Subject: [PATCH 02/10] futex2: Implement vectorized wait +Date: Fri, 5 Feb 2021 10:34:01 -0300 +Subject: [PATCH 02/14] futex2: Add support for shared futexes +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add support for shared futexes for cross-process resources. This design +relies on the same approach done in old futex to create an unique id for +file-backed shared memory, by using a counter at struct inode. + +There are two types of futexes: private and shared ones. The private are futexes +meant to be used by threads that shares the same memory space, are easier to be +uniquely identified an thus can have some performance optimization. The elements +for identifying one are: the start address of the page where the address is, +the address offset within the page and the current->mm pointer. + +Now, for uniquely identifying shared futex: + +- If the page containing the user address is an anonymous page, we can + just use the same data used for private futexes (the start address of + the page, the address offset within the page and the current->mm + pointer) that will be enough for uniquely identifying such futex. We + also set one bit at the key to differentiate if a private futex is + used on the same address (mixing shared and private calls are not + allowed). + +- If the page is file-backed, current->mm maybe isn't the same one for + every user of this futex, so we need to use other data: the + page->index, an UUID for the struct inode and the offset within the + page. + +Note that members of futex_key doesn't have any particular meaning after they +are part of the struct - they are just bytes to identify a futex. Given that, +we don't need to use a particular name or type that matches the original data, +we only need to care about the bitsize of each component and make both private +and shared data fit in the same memory space. + +Signed-off-by: André Almeida +--- + fs/inode.c | 1 + + include/linux/fs.h | 1 + + include/uapi/linux/futex.h | 2 + + kernel/futex2.c | 222 ++++++++++++++++++++++++++++++++++++- + 4 files changed, 220 insertions(+), 6 deletions(-) + +diff --git a/fs/inode.c b/fs/inode.c +index c93500d84264..73e82a304d10 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -138,6 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic64_set(&inode->i_sequence, 0); ++ atomic64_set(&inode->i_sequence2, 0); + atomic_set(&inode->i_count, 1); + inode->i_op = &empty_iops; + inode->i_fop = &no_open_fops; +diff --git a/include/linux/fs.h b/include/linux/fs.h +index c3c88fdb9b2a..5dd112c04357 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -682,6 +682,7 @@ struct inode { + }; + atomic64_t i_version; + atomic64_t i_sequence; /* see futex */ ++ atomic64_t i_sequence2; /* see futex2 */ + atomic_t i_count; + atomic_t i_dio_count; + atomic_t i_writecount; +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index 8d30f4b6d094..70ea66fffb1c 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -46,6 +46,8 @@ + + #define FUTEX_SIZE_MASK 0x3 + ++#define FUTEX_SHARED_FLAG 8 ++ + /* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. +diff --git a/kernel/futex2.c b/kernel/futex2.c +index ade407c1abb7..51086d0c3fd5 100644 +--- a/kernel/futex2.c ++++ b/kernel/futex2.c +@@ -14,8 +14,10 @@ + */ + + #include ++#include + #include + #include ++#include + #include + #include + #include +@@ -23,8 +25,8 @@ + + /** + * struct futex_key - Components to build unique key for a futex +- * @pointer: Pointer to current->mm +- * @index: Start address of the page containing futex ++ * @pointer: Pointer to current->mm or inode's UUID for file backed futexes ++ * @index: Start address of the page containing futex or index of the page + * @offset: Address offset of uaddr in a page + */ + struct futex_key { +@@ -79,7 +81,12 @@ struct futex_bucket { + + + /* Mask for futex2 flag operations */ +-#define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_CLOCK_REALTIME) ++#define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_CLOCK_REALTIME | FUTEX_SHARED_FLAG) ++ ++#define is_object_shared ((futexv->objects[i].flags & FUTEX_SHARED_FLAG) ? true : false) ++ ++#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */ ++#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */ + + static struct futex_bucket *futex_table; + static unsigned int futex2_hashsize; +@@ -127,16 +134,200 @@ static inline int bucket_get_waiters(struct futex_bucket *bucket) + #endif + } + ++/** ++ * futex_get_inode_uuid - Gets an UUID for an inode ++ * @inode: inode to get UUID ++ * ++ * Generate a machine wide unique identifier for this inode. ++ * ++ * This relies on u64 not wrapping in the life-time of the machine; which with ++ * 1ns resolution means almost 585 years. ++ * ++ * This further relies on the fact that a well formed program will not unmap ++ * the file while it has a (shared) futex waiting on it. This mapping will have ++ * a file reference which pins the mount and inode. ++ * ++ * If for some reason an inode gets evicted and read back in again, it will get ++ * a new sequence number and will _NOT_ match, even though it is the exact same ++ * file. ++ * ++ * It is important that match_futex() will never have a false-positive, esp. ++ * for PI futexes that can mess up the state. The above argues that false-negatives ++ * are only possible for malformed programs. ++ * ++ * Returns: UUID for the given inode ++ */ ++static u64 futex_get_inode_uuid(struct inode *inode) ++{ ++ static atomic64_t i_seq; ++ u64 old; ++ ++ /* Does the inode already have a sequence number? */ ++ old = atomic64_read(&inode->i_sequence2); ++ ++ if (likely(old)) ++ return old; ++ ++ for (;;) { ++ u64 new = atomic64_add_return(1, &i_seq); ++ ++ if (WARN_ON_ONCE(!new)) ++ continue; ++ ++ old = atomic64_cmpxchg_relaxed(&inode->i_sequence2, 0, new); ++ if (old) ++ return old; ++ return new; ++ } ++} ++ ++/** ++ * futex_get_shared_key - Get a key for a shared futex ++ * @address: Futex memory address ++ * @mm: Current process mm_struct pointer ++ * @key: Key struct to be filled ++ * ++ * Returns: 0 on success, error code otherwise ++ */ ++static int futex_get_shared_key(uintptr_t address, struct mm_struct *mm, ++ struct futex_key *key) ++{ ++ int ret; ++ struct page *page, *tail; ++ struct address_space *mapping; ++ ++again: ++ ret = get_user_pages_fast(address, 1, 0, &page); ++ if (ret < 0) ++ return ret; ++ ++ /* ++ * The treatment of mapping from this point on is critical. The page ++ * lock protects many things but in this context the page lock ++ * stabilizes mapping, prevents inode freeing in the shared ++ * file-backed region case and guards against movement to swap cache. ++ * ++ * Strictly speaking the page lock is not needed in all cases being ++ * considered here and page lock forces unnecessarily serialization ++ * From this point on, mapping will be re-verified if necessary and ++ * page lock will be acquired only if it is unavoidable ++ * ++ * Mapping checks require the head page for any compound page so the ++ * head page and mapping is looked up now. For anonymous pages, it ++ * does not matter if the page splits in the future as the key is ++ * based on the address. For filesystem-backed pages, the tail is ++ * required as the index of the page determines the key. For ++ * base pages, there is no tail page and tail == page. ++ */ ++ tail = page; ++ page = compound_head(page); ++ mapping = READ_ONCE(page->mapping); ++ ++ /* ++ * If page->mapping is NULL, then it cannot be a PageAnon ++ * page; but it might be the ZERO_PAGE or in the gate area or ++ * in a special mapping (all cases which we are happy to fail); ++ * or it may have been a good file page when get_user_pages_fast ++ * found it, but truncated or holepunched or subjected to ++ * invalidate_complete_page2 before we got the page lock (also ++ * cases which we are happy to fail). And we hold a reference, ++ * so refcount care in invalidate_complete_page's remove_mapping ++ * prevents drop_caches from setting mapping to NULL beneath us. ++ * ++ * The case we do have to guard against is when memory pressure made ++ * shmem_writepage move it from filecache to swapcache beneath us: ++ * an unlikely race, but we do need to retry for page->mapping. ++ */ ++ if (unlikely(!mapping)) { ++ int shmem_swizzled; ++ ++ /* ++ * Page lock is required to identify which special case above ++ * applies. If this is really a shmem page then the page lock ++ * will prevent unexpected transitions. ++ */ ++ lock_page(page); ++ shmem_swizzled = PageSwapCache(page) || page->mapping; ++ unlock_page(page); ++ put_page(page); ++ ++ if (shmem_swizzled) ++ goto again; ++ ++ return -EFAULT; ++ } ++ ++ /* ++ * Private mappings are handled in a simple way. ++ * ++ * If the futex key is stored on an anonymous page, then the associated ++ * object is the mm which is implicitly pinned by the calling process. ++ * ++ * NOTE: When userspace waits on a MAP_SHARED mapping, even if ++ * it's a read-only handle, it's expected that futexes attach to ++ * the object not the particular process. ++ */ ++ if (PageAnon(page)) { ++ key->offset |= FUT_OFF_MMSHARED; ++ } else { ++ struct inode *inode; ++ ++ /* ++ * The associated futex object in this case is the inode and ++ * the page->mapping must be traversed. Ordinarily this should ++ * be stabilised under page lock but it's not strictly ++ * necessary in this case as we just want to pin the inode, not ++ * update the radix tree or anything like that. ++ * ++ * The RCU read lock is taken as the inode is finally freed ++ * under RCU. If the mapping still matches expectations then the ++ * mapping->host can be safely accessed as being a valid inode. ++ */ ++ rcu_read_lock(); ++ ++ if (READ_ONCE(page->mapping) != mapping) { ++ rcu_read_unlock(); ++ put_page(page); ++ ++ goto again; ++ } ++ ++ inode = READ_ONCE(mapping->host); ++ if (!inode) { ++ rcu_read_unlock(); ++ put_page(page); ++ ++ goto again; ++ } ++ ++ key->pointer = futex_get_inode_uuid(inode); ++ key->index = (unsigned long)page_to_pgoff(tail); ++ key->offset |= FUT_OFF_INODE; ++ ++ rcu_read_unlock(); ++ } ++ ++ put_page(page); ++ ++ return 0; ++} ++ + /** + * futex_get_bucket - Check if the user address is valid, prepare internal + * data and calculate the hash + * @uaddr: futex user address + * @key: data that uniquely identifies a futex ++ * @shared: is this a shared futex? ++ * ++ * For private futexes, each uaddr will be unique for a given mm_struct, and it ++ * won't be freed for the life time of the process. For shared futexes, check ++ * futex_get_shared_key(). + * + * Return: address of bucket on success, error code otherwise + */ + static struct futex_bucket *futex_get_bucket(void __user *uaddr, +- struct futex_key *key) ++ struct futex_key *key, ++ bool shared) + { + uintptr_t address = (uintptr_t)uaddr; + u32 hash_key; +@@ -152,6 +343,9 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr, + key->pointer = (u64)address; + key->index = (unsigned long)current->mm; + ++ if (shared) ++ futex_get_shared_key(address, current->mm, key); ++ + /* Generate hash key for this futex using uaddr and current->mm */ + hash_key = jhash2((u32 *)key, sizeof(*key) / sizeof(u32), 0); + +@@ -289,6 +483,7 @@ static int futex_enqueue(struct futex_waiter_head *futexv, unsigned int nr_futex + int i, ret; + u32 uval, val; + u32 __user *uaddr; ++ bool retry = false; + struct futex_bucket *bucket; + + retry: +@@ -298,6 +493,18 @@ static int futex_enqueue(struct futex_waiter_head *futexv, unsigned int nr_futex + uaddr = (u32 __user *)futexv->objects[i].uaddr; + val = (u32)futexv->objects[i].val; + ++ if (is_object_shared && retry) { ++ struct futex_bucket *tmp = ++ futex_get_bucket((void __user *)uaddr, ++ &futexv->objects[i].key, true); ++ if (IS_ERR(tmp)) { ++ __set_current_state(TASK_RUNNING); ++ futex_dequeue_multiple(futexv, i); ++ return PTR_ERR(tmp); ++ } ++ futexv->objects[i].bucket = tmp; ++ } ++ + bucket = futexv->objects[i].bucket; + + bucket_inc_waiters(bucket); +@@ -318,6 +525,7 @@ static int futex_enqueue(struct futex_waiter_head *futexv, unsigned int nr_futex + if (__get_user(uval, uaddr)) + return -EFAULT; + ++ retry = true; + goto retry; + } + +@@ -459,6 +667,7 @@ static int futex_set_timer_and_wait(struct futex_waiter_head *futexv, + SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + unsigned int, flags, struct __kernel_timespec __user *, timo) + { ++ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false; + unsigned int size = flags & FUTEX_SIZE_MASK; + struct futex_waiter *waiter; + struct futex_waiter_head *futexv; +@@ -488,7 +697,7 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + INIT_LIST_HEAD(&waiter->list); + + /* Get an unlocked hash bucket */ +- waiter->bucket = futex_get_bucket(uaddr, &waiter->key); ++ waiter->bucket = futex_get_bucket(uaddr, &waiter->key, shared); + if (IS_ERR(waiter->bucket)) + return PTR_ERR(waiter->bucket); + +@@ -554,6 +763,7 @@ static inline bool futex_match(struct futex_key key1, struct futex_key key2) + SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + unsigned int, flags) + { ++ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false; + unsigned int size = flags & FUTEX_SIZE_MASK; + struct futex_waiter waiter, *aux, *tmp; + struct futex_bucket *bucket; +@@ -566,7 +776,7 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + if (size != FUTEX_32) + return -EINVAL; + +- bucket = futex_get_bucket(uaddr, &waiter.key); ++ bucket = futex_get_bucket(uaddr, &waiter.key, shared); + if (IS_ERR(bucket)) + return PTR_ERR(bucket); + +-- +2.31.1 + + +From 649c033164d9a09f9ab682f579298b5f0449fe70 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Fri, 5 Feb 2021 10:34:00 -0300 +Subject: [PATCH 03/14] futex2: Implement vectorized wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -398,8 +1359,8 @@ futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes, unsigned int flags, struct timespec *timo) struct futex_waitv { - __u64 val; void *uaddr; + unsigned int val; unsigned int flags; }; @@ -407,32 +1368,61 @@ Given an array of struct futex_waitv, wait on each uaddr. The thread wakes if a futex_wake() is performed at any uaddr. The syscall returns immediately if any waiter has *uaddr != val. *timo is an optional timeout value for the operation. The flags argument of the syscall -should be used solely for specifying the timeout clock as realtime, if -needed. Flags for shared futexes, sizes, etc. should be used on the -individual flags of each waiter. +should be used solely for specifying the timeout as realtime, if needed. +Flags for shared futexes, sizes, etc. should be used on the individual +flags of each waiter. -Returns the array index of one of the awakened futexes. There’s no given +Returns the array index of one of the awakened futexes. There’s no given information of how many were awakened, or any particular attribute of it -(if it’s the first awakened, if it is of the smaller index...). ---- - arch/x86/entry/syscalls/syscall_32.tbl | 1 + - arch/x86/entry/syscalls/syscall_64.tbl | 1 + - include/linux/compat.h | 10 + - include/linux/futex.h | 78 ++++++ - include/uapi/asm-generic/unistd.h | 4 +- - include/uapi/linux/futex.h | 15 ++ - kernel/futex.c | 70 +---- - kernel/futex2.c | 347 +++++++++++++++++++++++++ - kernel/sys_ni.c | 2 + - 9 files changed, 464 insertions(+), 64 deletions(-) +(if it’s the first awakened, if it is of the smaller index...). +Signed-off-by: André Almeida + +Rebased-by: Joshua Ashton +--- + arch/arm/tools/syscall.tbl | 1 + + arch/arm64/include/asm/unistd.h | 2 +- + arch/x86/entry/syscalls/syscall_32.tbl | 1 + + arch/x86/entry/syscalls/syscall_64.tbl | 1 + + include/linux/compat.h | 11 ++ + include/linux/syscalls.h | 4 + + include/uapi/asm-generic/unistd.h | 5 +- + include/uapi/linux/futex.h | 14 ++ + kernel/futex2.c | 177 ++++++++++++++++++ + kernel/sys_ni.c | 1 + + tools/include/uapi/asm-generic/unistd.h | 5 +- + .../arch/x86/entry/syscalls/syscall_64.tbl | 1 + + 12 files changed, 220 insertions(+), 3 deletions(-) + +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index b60a8bdab623..6e476c34bd00 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -462,3 +462,4 @@ + 446 common landlock_restrict_self sys_landlock_restrict_self + 447 common futex_wait sys_futex_wait + 448 common futex_wake sys_futex_wake ++449 common futex_waitv sys_futex_waitv +diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h +index 3cb206aea3db..6bdb5f5db438 100644 +--- a/arch/arm64/include/asm/unistd.h ++++ b/arch/arm64/include/asm/unistd.h +@@ -38,7 +38,7 @@ + #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) + #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) + +-#define __NR_compat_syscalls 449 ++#define __NR_compat_syscalls 450 + #endif + + #define __ARCH_WANT_SYS_CLONE diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index e3b827a9c094..5573437c1914 100644 +index f75de79fa93d..b991991a434a 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -453,3 +453,4 @@ - 447 i386 memfd_secret sys_memfd_secret - 448 i386 futex_wait sys_futex_wait compat_sys_futex_wait + 447 common memfd_secret sys_memfd_secret + 448 i386 futex_wait sys_futex_wait 449 i386 futex_wake sys_futex_wake +450 i386 futex_waitv sys_futex_waitv compat_sys_futex_waitv diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl @@ -448,7 +1438,7 @@ index 63b447255df2..bad4aca3e9ba 100644 # # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/linux/compat.h b/include/linux/compat.h -index 8855b1b702b2..6e3abdde1c86 100644 +index 8855b1b702b2..06a40776d8a5 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -368,6 +368,12 @@ struct compat_robust_list_head { @@ -456,132 +1446,59 @@ index 8855b1b702b2..6e3abdde1c86 100644 }; +struct compat_futex_waitv { -+ compat_u64 val; + compat_uptr_t uaddr; ++ compat_uint_t val; + compat_uint_t flags; +}; + #ifdef CONFIG_COMPAT_OLD_SIGACTION struct compat_old_sigaction { compat_uptr_t sa_handler; -@@ -692,6 +698,10 @@ asmlinkage long +@@ -692,6 +698,11 @@ asmlinkage long compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr); -+ ++/* kernel/futex2.c */ +asmlinkage long compat_sys_futex_waitv(struct compat_futex_waitv *waiters, + compat_uint_t nr_futexes, compat_uint_t flags, + struct __kernel_timespec __user *timo); ++ /* kernel/itimer.c */ asmlinkage long compat_sys_getitimer(int which, struct old_itimerval32 __user *it); -diff --git a/include/linux/futex.h b/include/linux/futex.h -index abcc001f992a..18be31767024 100644 ---- a/include/linux/futex.h -+++ b/include/linux/futex.h -@@ -50,6 +50,45 @@ union futex_key { - } both; - }; +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 0f9b64cc34f7..7d166f7304ae 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -71,6 +71,7 @@ struct open_how; + struct mount_attr; + struct landlock_ruleset_attr; + enum landlock_rule_type; ++struct futex_waitv; -+/** -+ * struct futex_q - The hashed futex queue entry, one per waiting task -+ * @list: priority-sorted list of tasks waiting on this futex -+ * @task: the task waiting on the futex -+ * @lock_ptr: the hash bucket lock -+ * @key: the key the futex is hashed on -+ * @pi_state: optional priority inheritance state -+ * @rt_waiter: rt_waiter storage for use with requeue_pi -+ * @requeue_pi_key: the requeue_pi target futex key -+ * @bitset: bitset for the optional bitmasked wakeup -+ * -+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so -+ * we can wake only the relevant ones (hashed queues may be shared). -+ * -+ * A futex_q has a woken state, just like tasks have TASK_RUNNING. -+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. -+ * The order of wakeup is always to make the first condition true, then -+ * the second. -+ * -+ * PI futexes are typically woken before they are removed from the hash list via -+ * the rt_mutex code. See unqueue_me_pi(). -+ */ -+struct futex_q { -+ struct plist_node list; -+ -+ struct task_struct *task; -+ spinlock_t *lock_ptr; -+ union futex_key key; -+ struct futex_pi_state *pi_state; -+ struct rt_mutex_waiter *rt_waiter; -+ union futex_key *requeue_pi_key; -+ u32 bitset; -+} __randomize_layout; -+ -+struct futex_vector { -+ struct futex_waitv w; -+ struct futex_q q; -+}; -+ - #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } } + #include + #include +@@ -629,6 +630,9 @@ asmlinkage long sys_futex_wait(void __user *uaddr, unsigned int val, + struct __kernel_timespec __user *timo); + asmlinkage long sys_futex_wake(void __user *uaddr, unsigned int nr_wake, + unsigned int flags); ++asmlinkage long sys_futex_waitv(struct futex_waitv __user *waiters, ++ unsigned int nr_futexes, unsigned int flags, ++ struct __kernel_timespec __user *timo); - #ifdef CONFIG_FUTEX -@@ -99,6 +138,45 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time - #define FLAGS_CLOCKRT 0x02 - #define FLAGS_HAS_TIMEOUT 0x04 - -+/* -+ * Hash buckets are shared by all the futex_keys that hash to the same -+ * location. Each key may have multiple futex_q structures, one for each task -+ * waiting on a futex. -+ */ -+struct futex_hash_bucket { -+ atomic_t waiters; -+ spinlock_t lock; -+ struct plist_head chain; -+} ____cacheline_aligned_in_smp; -+ -+void queue_me(struct futex_q *q, struct futex_hash_bucket *hb); -+ -+int unqueue_me(struct futex_q *q); -+ -+enum futex_access { -+ FUTEX_READ, -+ FUTEX_WRITE -+}; -+ -+int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, -+ enum futex_access rw); -+ -+struct futex_hash_bucket *queue_lock(struct futex_q *q); -+ -+struct hrtimer_sleeper *futex_setup_timer(ktime_t *time, -+ struct hrtimer_sleeper *timeout, -+ int flags, u64 range_ns); -+ -+void queue_unlock(struct futex_hash_bucket *hb); -+ -+int get_futex_value_locked(u32 *dest, u32 __user *from); -+ -+static const struct futex_q futex_q_init = { -+ /* list gets initialized in queue_me()*/ -+ .key = FUTEX_KEY_INIT, -+ .bitset = FUTEX_BITSET_MATCH_ANY -+}; -+ - #else - static inline void futex_init_task(struct task_struct *tsk) { } - static inline void futex_exit_recursive(struct task_struct *tsk) { } + /* kernel/hrtimer.c */ + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index df9fe2e23ee0..f9f79e5fe98f 100644 +index 2a62ecca2b00..1179d3f02d65 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h -@@ -876,9 +876,11 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) - __SC_COMP(__NR_futex_wait, sys_futex_wait, compat_sys_futex_wait) - #define __NR_futex_wake 449 +@@ -879,8 +879,11 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait) + #define __NR_futex_wake 444 __SYSCALL(__NR_futex_wake, sys_futex_wake) -+#define __NR_futex_wait 450 -+__SC_COMP(__NR_futex_waitv, sys_futex_waitv, compat_sys_futex_waitv) ++#define __NR_futex_waitv 445 ++__SC_COMP(__NR_futex_waitv, sys_futex_waitv, compat_sys_futex_waitv) ++ #undef __NR_syscalls -#define __NR_syscalls 450 +#define __NR_syscalls 451 @@ -589,375 +1506,51 @@ index df9fe2e23ee0..f9f79e5fe98f 100644 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 44750caa261e..daa135bdedda 100644 +index 70ea66fffb1c..3216aee015d2 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h -@@ -45,6 +45,21 @@ - #define FUTEX_32 2 +@@ -48,6 +48,20 @@ + #define FUTEX_SHARED_FLAG 8 - #define FUTEX_SIZE_MASK 0x3 -+ + +#define FUTEX_WAITV_MAX 128 + +/** + * struct futex_waitv - A waiter for vectorized wait -+ * @val: Expected value at uaddr + * @uaddr: User address to wait on ++ * @val: Expected value at uaddr + * @flags: Flags for this waiter + */ +struct futex_waitv { -+ __u64 val; + void __user *uaddr; ++ unsigned int val; + unsigned int flags; +}; + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. -diff --git a/kernel/futex.c b/kernel/futex.c -index ef7131bd8bc4..2cd922ab82da 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -171,57 +171,6 @@ struct futex_pi_state { - union futex_key key; - } __randomize_layout; - --/** -- * struct futex_q - The hashed futex queue entry, one per waiting task -- * @list: priority-sorted list of tasks waiting on this futex -- * @task: the task waiting on the futex -- * @lock_ptr: the hash bucket lock -- * @key: the key the futex is hashed on -- * @pi_state: optional priority inheritance state -- * @rt_waiter: rt_waiter storage for use with requeue_pi -- * @requeue_pi_key: the requeue_pi target futex key -- * @bitset: bitset for the optional bitmasked wakeup -- * -- * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so -- * we can wake only the relevant ones (hashed queues may be shared). -- * -- * A futex_q has a woken state, just like tasks have TASK_RUNNING. -- * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. -- * The order of wakeup is always to make the first condition true, then -- * the second. -- * -- * PI futexes are typically woken before they are removed from the hash list via -- * the rt_mutex code. See unqueue_me_pi(). -- */ --struct futex_q { -- struct plist_node list; -- -- struct task_struct *task; -- spinlock_t *lock_ptr; -- union futex_key key; -- struct futex_pi_state *pi_state; -- struct rt_mutex_waiter *rt_waiter; -- union futex_key *requeue_pi_key; -- u32 bitset; --} __randomize_layout; -- --static const struct futex_q futex_q_init = { -- /* list gets initialized in queue_me()*/ -- .key = FUTEX_KEY_INIT, -- .bitset = FUTEX_BITSET_MATCH_ANY --}; -- --/* -- * Hash buckets are shared by all the futex_keys that hash to the same -- * location. Each key may have multiple futex_q structures, one for each task -- * waiting on a futex. -- */ --struct futex_hash_bucket { -- atomic_t waiters; -- spinlock_t lock; -- struct plist_head chain; --} ____cacheline_aligned_in_smp; -- - /* - * The base of the bucket array and its size are always used together - * (after initialization only in hash_futex()), so ensure that they -@@ -364,11 +313,6 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) - && key1->both.offset == key2->both.offset); - } - --enum futex_access { -- FUTEX_READ, -- FUTEX_WRITE --}; -- - /** - * futex_setup_timer - set up the sleeping hrtimer. - * @time: ptr to the given timeout value -@@ -379,7 +323,7 @@ enum futex_access { - * Return: Initialized hrtimer_sleeper structure or NULL if no timeout - * value given - */ --static inline struct hrtimer_sleeper * -+inline struct hrtimer_sleeper * - futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, - int flags, u64 range_ns) - { -@@ -465,7 +409,7 @@ static u64 get_inode_sequence_number(struct inode *inode) - * - * lock_page() might sleep, the caller should not hold a spinlock. - */ --static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, -+int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, - enum futex_access rw) - { - unsigned long address = (unsigned long)uaddr; -@@ -698,7 +642,7 @@ static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, - return ret; - } - --static int get_futex_value_locked(u32 *dest, u32 __user *from) -+inline int get_futex_value_locked(u32 *dest, u32 __user *from) - { - int ret; - -@@ -2173,7 +2117,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - } - - /* The key must be already stored in q->key. */ --static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) -+inline struct futex_hash_bucket *queue_lock(struct futex_q *q) - __acquires(&hb->lock) - { - struct futex_hash_bucket *hb; -@@ -2196,7 +2140,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) - return hb; - } - --static inline void -+inline void - queue_unlock(struct futex_hash_bucket *hb) - __releases(&hb->lock) - { -@@ -2235,7 +2179,7 @@ static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for - * an example). - */ --static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) -+inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) - { - __queue_me(q, hb); -@@ -2253,7 +2197,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - * - 1 - if the futex_q was still queued (and we removed unqueued it); - * - 0 - if the futex_q was already removed by the waking thread - */ --static int unqueue_me(struct futex_q *q) -+int unqueue_me(struct futex_q *q) - { - spinlock_t *lock_ptr; - int ret = 0; diff --git a/kernel/futex2.c b/kernel/futex2.c -index 4db771db48ee..e5afb5faf98d 100644 +index 51086d0c3fd5..beb2ce11ac83 100644 --- a/kernel/futex2.c +++ b/kernel/futex2.c -@@ -7,6 +7,8 @@ +@@ -83,6 +83,12 @@ struct futex_bucket { + /* Mask for futex2 flag operations */ + #define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_CLOCK_REALTIME | FUTEX_SHARED_FLAG) - #include - -+#include -+#include - #include - - /* -@@ -16,6 +17,352 @@ - */ - #define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG | FUTEX_CLOCK_REALTIME) - -+/* Mask for each futex in futex_waitv list */ -+#define FUTEXV_WAITER_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG) -+ +/* Mask for sys_futex_waitv flag */ +#define FUTEXV_MASK (FUTEX_CLOCK_REALTIME) + -+/** -+ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket -+ * @q: The list of futexes to unqueue -+ * @count: Number of futexes in the list -+ * -+ * Helper to unqueue a list of futexes. This can't fail. -+ * -+ * Return: -+ * - >=0 - Index of the last futex that was awoken; -+ * - -1 - If no futex was awoken -+ */ -+static int unqueue_multiple(struct futex_vector *v, int count) -+{ -+ int ret = -1, i; -+ -+ for (i = 0; i < count; i++) { -+ if (!unqueue_me(&v[i].q)) -+ ret = i; -+ } -+ -+ return ret; -+} -+ -+/** -+ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes -+ * @qs: The corresponding futex list -+ * @count: The size of the lists -+ * @flags: Futex flags (FLAGS_SHARED, etc.) -+ * @awaken: Index of the last awoken futex -+ * -+ * Prepare multiple futexes in a single step and enqueue them. This may fail if -+ * the futex list is invalid or if any futex was already awoken. On success the -+ * task is ready to interruptible sleep. -+ * -+ * Return: -+ * - 1 - One of the futexes was awaken by another thread -+ * - 0 - Success -+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL -+ */ -+static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *awaken) -+{ -+ struct futex_hash_bucket *hb; -+ int ret, i; -+ u32 uval; -+ -+ /* -+ * Enqueuing multiple futexes is tricky, because we need to -+ * enqueue each futex in the list before dealing with the next -+ * one to avoid deadlocking on the hash bucket. But, before -+ * enqueuing, we need to make sure that current->state is -+ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which -+ * cannot be done before the get_futex_key of the next key, -+ * because it calls get_user_pages, which can sleep. Thus, we -+ * fetch the list of futexes keys in two steps, by first pinning -+ * all the memory keys in the futex key, and only then we read -+ * each key and queue the corresponding futex. -+ */ -+retry: -+ for (i = 0; i < count; i++) { -+ ret = get_futex_key(vs[i].w.uaddr, -+ vs[i].w.flags & FUTEX_SHARED_FLAG, -+ &vs[i].q.key, FUTEX_READ); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ for (i = 0; i < count; i++) { -+ struct futex_q *q = &vs[i].q; -+ struct futex_waitv *waitv = &vs[i].w; -+ -+ hb = queue_lock(q); -+ ret = get_futex_value_locked(&uval, waitv->uaddr); -+ if (ret) { -+ /* -+ * We need to try to handle the fault, which -+ * cannot be done without sleep, so we need to -+ * undo all the work already done, to make sure -+ * we don't miss any wake ups. Therefore, clean -+ * up, handle the fault and retry from the -+ * beginning. -+ */ -+ queue_unlock(hb); -+ __set_current_state(TASK_RUNNING); -+ -+ *awaken = unqueue_multiple(vs, i); -+ if (*awaken >= 0) -+ return 1; -+ -+ if (get_user(uval, (u32 __user *)waitv->uaddr)) -+ return -EINVAL; -+ -+ goto retry; -+ } -+ -+ if (uval != waitv->val) { -+ queue_unlock(hb); -+ __set_current_state(TASK_RUNNING); -+ -+ /* -+ * If something was already awaken, we can -+ * safely ignore the error and succeed. -+ */ -+ *awaken = unqueue_multiple(vs, i); -+ if (*awaken >= 0) -+ return 1; -+ -+ return -EWOULDBLOCK; -+ } -+ -+ /* -+ * The bucket lock can't be held while dealing with the -+ * next futex. Queue each futex at this moment so hb can -+ * be unlocked. -+ */ -+ queue_me(&vs[i].q, hb); -+ } -+ return 0; -+} -+ -+/** -+ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes -+ * @qs: The list of futexes to wait on -+ * @op: Operation code from futex's syscall -+ * @count: The number of objects -+ * @abs_time: Timeout before giving up and returning to userspace -+ * -+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function -+ * sleeps on a group of futexes and returns on the first futex that -+ * triggered, or after the timeout has elapsed. -+ * -+ * Return: -+ * - >=0 - Hint to the futex that was awoken -+ * - <0 - On error -+ */ -+static int futex_wait_multiple(struct futex_vector *qs, unsigned int count, -+ struct hrtimer_sleeper *to) -+{ -+ int ret, hint = 0; -+ unsigned int i; -+ -+ while (1) { -+ ret = futex_wait_multiple_setup(qs, count, &hint); -+ if (ret) { -+ if (ret > 0) { -+ /* A futex was awaken during setup */ -+ ret = hint; -+ } -+ return ret; -+ } -+ -+ if (to) -+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); -+ -+ /* -+ * Avoid sleeping if another thread already tried to -+ * wake us. -+ */ -+ for (i = 0; i < count; i++) { -+ if (plist_node_empty(&qs[i].q.list)) -+ break; -+ } -+ -+ if (i == count && (!to || to->task)) -+ freezable_schedule(); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ ret = unqueue_multiple(qs, count); -+ if (ret >= 0) -+ return ret; -+ -+ if (to && !to->task) -+ return -ETIMEDOUT; -+ else if (signal_pending(current)) -+ return -ERESTARTSYS; -+ /* -+ * The final case is a spurious wakeup, for -+ * which just retry. -+ */ -+ } -+} ++/* Mask for each futex in futex_waitv list */ ++#define FUTEXV_WAITER_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG) + + #define is_object_shared ((futexv->objects[i].flags & FUTEX_SHARED_FLAG) ? true : false) + + #define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */ +@@ -704,6 +710,177 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + return futex_set_timer_and_wait(futexv, 1, timo, flags); + } + +#ifdef CONFIG_COMPAT +/** + * compat_futex_parse_waitv - Parse a waitv array from userspace @@ -967,25 +1560,38 @@ index 4db771db48ee..e5afb5faf98d 100644 + * + * Return: Error code on failure, pointer to a prepared futexv otherwise + */ -+static int compat_futex_parse_waitv(struct futex_vector *futexv, ++static int compat_futex_parse_waitv(struct futex_waiter_head *futexv, + struct compat_futex_waitv __user *uwaitv, + unsigned int nr_futexes) +{ -+ struct compat_futex_waitv aux; ++ struct futex_bucket *bucket; ++ struct compat_futex_waitv waitv; + unsigned int i; + + for (i = 0; i < nr_futexes; i++) { -+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) ++ if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv))) + return -EFAULT; + -+ if ((aux.flags & ~FUTEXV_WAITER_MASK) || -+ (aux.flags & FUTEX_SIZE_MASK) != FUTEX_32) ++ if ((waitv.flags & ~FUTEXV_WAITER_MASK) || ++ (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32) + return -EINVAL; + -+ futexv[i].w.flags = aux.flags; -+ futexv[i].w.val = aux.val; -+ futexv[i].w.uaddr = compat_ptr(aux.uaddr); -+ futexv[i].q = futex_q_init; ++ futexv->objects[i].key.pointer = 0; ++ futexv->objects[i].flags = waitv.flags; ++ futexv->objects[i].uaddr = compat_ptr(waitv.uaddr); ++ futexv->objects[i].val = waitv.val; ++ futexv->objects[i].index = i; ++ ++ bucket = futex_get_bucket(compat_ptr(waitv.uaddr), ++ &futexv->objects[i].key, ++ is_object_shared); ++ ++ if (IS_ERR(bucket)) ++ return PTR_ERR(bucket); ++ ++ futexv->objects[i].bucket = bucket; ++ ++ INIT_LIST_HEAD(&futexv->objects[i].list); + } + + return 0; @@ -995,10 +1601,7 @@ index 4db771db48ee..e5afb5faf98d 100644 + unsigned int, nr_futexes, unsigned int, flags, + struct __kernel_timespec __user *, timo) +{ -+ struct hrtimer_sleeper to; -+ struct futex_vector *futexv; -+ struct timespec64 ts; -+ ktime_t time; ++ struct futex_waiter_head *futexv; + int ret; + + if (flags & ~FUTEXV_MASK) @@ -1007,72 +1610,93 @@ index 4db771db48ee..e5afb5faf98d 100644 + if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) + return -EINVAL; + -+ if (timo) { -+ int flag_clkid = 0; -+ -+ if (get_timespec64(&ts, timo)) -+ return -EFAULT; -+ -+ if (!timespec64_valid(&ts)) -+ return -EINVAL; -+ -+ if (flags & FUTEX_CLOCK_REALTIME) -+ flag_clkid = FLAGS_CLOCKRT; -+ -+ time = timespec64_to_ktime(ts); -+ futex_setup_timer(&time, &to, flag_clkid, 0); -+ } -+ -+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); ++ futexv = kmalloc((sizeof(struct futex_waiter) * nr_futexes) + ++ sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return -ENOMEM; + -+ ret = compat_futex_parse_waitv(futexv, waiters, nr_futexes); -+ if (!ret) -+ ret = futex_wait_multiple(futexv, nr_futexes, timo ? &to : NULL); ++ futexv->hint = false; ++ futexv->task = current; + -+ if (timo) { -+ hrtimer_cancel(&to.timer); -+ destroy_hrtimer_on_stack(&to.timer); -+ } ++ ret = compat_futex_parse_waitv(futexv, waiters, nr_futexes); ++ ++ if (!ret) ++ ret = futex_set_timer_and_wait(futexv, nr_futexes, timo, flags); + + kfree(futexv); ++ + return ret; +} +#endif + -+static int futex_parse_waitv(struct futex_vector *futexv, ++/** ++ * futex_parse_waitv - Parse a waitv array from userspace ++ * @futexv: Kernel side list of waiters to be filled ++ * @uwaitv: Userspace list to be parsed ++ * @nr_futexes: Length of futexv ++ * ++ * Return: Error code on failure, pointer to a prepared futexv otherwise ++ */ ++static int futex_parse_waitv(struct futex_waiter_head *futexv, + struct futex_waitv __user *uwaitv, + unsigned int nr_futexes) +{ -+ struct futex_waitv aux; ++ struct futex_bucket *bucket; ++ struct futex_waitv waitv; + unsigned int i; + + for (i = 0; i < nr_futexes; i++) { -+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) ++ if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv))) + return -EFAULT; + -+ if ((aux.flags & ~FUTEXV_WAITER_MASK) || -+ (aux.flags & FUTEX_SIZE_MASK) != FUTEX_32) ++ if ((waitv.flags & ~FUTEXV_WAITER_MASK) || ++ (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32) + return -EINVAL; + -+ futexv[i].w.flags = aux.flags; -+ futexv[i].w.val = aux.val; -+ futexv[i].w.uaddr = aux.uaddr; -+ futexv[i].q = futex_q_init; ++ futexv->objects[i].key.pointer = 0; ++ futexv->objects[i].flags = waitv.flags; ++ futexv->objects[i].uaddr = waitv.uaddr; ++ futexv->objects[i].val = waitv.val; ++ futexv->objects[i].index = i; ++ ++ bucket = futex_get_bucket(waitv.uaddr, &futexv->objects[i].key, ++ is_object_shared); ++ ++ if (IS_ERR(bucket)) ++ return PTR_ERR(bucket); ++ ++ futexv->objects[i].bucket = bucket; ++ ++ INIT_LIST_HEAD(&futexv->objects[i].list); + } + + return 0; +} + ++/** ++ * sys_futex_waitv - Wait on a list of futexes ++ * @waiters: List of futexes to wait on ++ * @nr_futexes: Length of futexv ++ * @flags: Flag for timeout (monotonic/realtime) ++ * @timo: Optional absolute timeout. ++ * ++ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes ++ * if a futex_wake() is performed at any uaddr. The syscall returns immediately ++ * if any waiter has *uaddr != val. *timo is an optional timeout value for the ++ * operation. Each waiter has individual flags. The `flags` argument for the ++ * syscall should be used solely for specifying the timeout as realtime, if ++ * needed. Flags for shared futexes, sizes, etc. should be used on the ++ * individual flags of each waiter. ++ * ++ * Returns the array index of one of the awaken futexes. There's no given ++ * information of how many were awakened, or any particular attribute of it (if ++ * it's the first awakened, if it is of the smaller index...). ++ */ +SYSCALL_DEFINE4(futex_waitv, struct futex_waitv __user *, waiters, + unsigned int, nr_futexes, unsigned int, flags, + struct __kernel_timespec __user *, timo) +{ -+ struct hrtimer_sleeper to; -+ struct futex_vector *futexv; -+ struct timespec64 ts; -+ ktime_t time; ++ struct futex_waiter_head *futexv; + int ret; + + if (flags & ~FUTEXV_MASK) @@ -1081,62 +1705,586 @@ index 4db771db48ee..e5afb5faf98d 100644 + if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) + return -EINVAL; + -+ if (timo) { -+ int flag_clkid = 0; -+ -+ if (get_timespec64(&ts, timo)) -+ return -EFAULT; -+ -+ if (!timespec64_valid(&ts)) -+ return -EINVAL; -+ -+ if (flags & FUTEX_CLOCK_REALTIME) -+ flag_clkid = FLAGS_CLOCKRT; -+ -+ time = timespec64_to_ktime(ts); -+ futex_setup_timer(&time, &to, flag_clkid, 0); -+ } -+ -+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); ++ futexv = kmalloc((sizeof(struct futex_waiter) * nr_futexes) + ++ sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return -ENOMEM; + ++ futexv->hint = false; ++ futexv->task = current; ++ + ret = futex_parse_waitv(futexv, waiters, nr_futexes); + if (!ret) -+ ret = futex_wait_multiple(futexv, nr_futexes, timo ? &to : NULL); -+ -+ if (timo) { -+ hrtimer_cancel(&to.timer); -+ destroy_hrtimer_on_stack(&to.timer); -+ } ++ ret = futex_set_timer_and_wait(futexv, nr_futexes, timo, flags); + + kfree(futexv); ++ + return ret; +} + - static long ksys_futex_wait(void __user *uaddr, u64 val, unsigned int flags, - struct __kernel_timespec __user *timo) - { + /** + * futex_get_parent - For a given futex in a futexv list, get a pointer to the futexv + * @waiter: Address of futex in the list diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index dbe397eaea46..93807bb7be51 100644 +index 9addbe373f00..d70bb8cb884f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c -@@ -155,6 +155,8 @@ COND_SYSCALL_COMPAT(get_robust_list); +@@ -154,6 +154,7 @@ COND_SYSCALL_COMPAT(get_robust_list); + /* kernel/futex2.c */ COND_SYSCALL(futex_wait); - COND_SYSCALL_COMPAT(futex_wait); COND_SYSCALL(futex_wake); +COND_SYSCALL(futex_waitv); -+COND_SYSCALL_COMPAT(futex_waitv); + + /* kernel/hrtimer.c */ + +diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h +index 2a62ecca2b00..1179d3f02d65 100644 +--- a/tools/include/uapi/asm-generic/unistd.h ++++ b/tools/include/uapi/asm-generic/unistd.h +@@ -879,8 +879,11 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait) + #define __NR_futex_wake 444 + __SYSCALL(__NR_futex_wake, sys_futex_wake) + ++#define __NR_futex_waitv 445 ++__SC_COMP(__NR_futex_waitv, sys_futex_waitv, compat_sys_futex_waitv) ++ + #undef __NR_syscalls +-#define __NR_syscalls 450 ++#define __NR_syscalls 451 + + /* + * 32 bit systems traditionally used different +diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +index 8eb17cc08a69..faa5a3442e43 100644 +--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +@@ -370,6 +370,7 @@ + 447 common memfd_secret sys_memfd_secret + 448 common futex_wait sys_futex_wait + 449 common futex_wake sys_futex_wake ++450 common futex_waitv sys_futex_waitv + + # + # Due to a historical design error, certain syscalls are numbered differently +-- +2.31.1 + + +From 3f11c8e493c1c7a6602ed564ee4c5e074c90b10f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Fri, 5 Feb 2021 10:34:01 -0300 +Subject: [PATCH 04/14] futex2: Implement requeue operation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Implement requeue interface similary to FUTEX_CMP_REQUEUE operation. +This is the syscall implemented by this patch: + +futex_requeue(struct futex_requeue *uaddr1, struct futex_requeue *uaddr2, + unsigned int nr_wake, unsigned int nr_requeue, + unsigned int cmpval, unsigned int flags) + +struct futex_requeue { + void *uaddr; + unsigned int flags; +}; + +If (uaddr1->uaddr == cmpval), wake at uaddr1->uaddr a nr_wake number of +waiters and then, remove a number of nr_requeue waiters at uaddr1->uaddr +and add them to uaddr2->uaddr list. Each uaddr has its own set of flags, +that must be defined at struct futex_requeue (such as size, shared, NUMA). +The flags argument of the syscall is there just for the sake of +extensibility, and right now it needs to be zero. + +Return the number of the woken futexes + the number of requeued ones on +success, error code otherwise. + +Signed-off-by: André Almeida + +Rebased-by: Joshua Ashton +--- + arch/arm/tools/syscall.tbl | 1 + + arch/arm64/include/asm/unistd.h | 2 +- + arch/x86/entry/syscalls/syscall_32.tbl | 1 + + arch/x86/entry/syscalls/syscall_64.tbl | 1 + + include/linux/compat.h | 12 ++ + include/linux/syscalls.h | 5 + + include/uapi/asm-generic/unistd.h | 5 +- + include/uapi/linux/futex.h | 10 ++ + kernel/futex2.c | 215 +++++++++++++++++++++++++ + kernel/sys_ni.c | 1 + + 10 files changed, 251 insertions(+), 2 deletions(-) + +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index 6e476c34bd00..25f175ada125 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -463,3 +463,4 @@ + 447 common futex_wait sys_futex_wait + 448 common futex_wake sys_futex_wake + 449 common futex_waitv sys_futex_waitv ++450 common futex_requeue sys_futex_requeue +diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h +index 6bdb5f5db438..4e65da3445c7 100644 +--- a/arch/arm64/include/asm/unistd.h ++++ b/arch/arm64/include/asm/unistd.h +@@ -38,7 +38,7 @@ + #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) + #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) + +-#define __NR_compat_syscalls 450 ++#define __NR_compat_syscalls 451 + #endif + + #define __ARCH_WANT_SYS_CLONE +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index b991991a434a..1c3ca8b50247 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -454,3 +454,4 @@ + 448 i386 futex_wait sys_futex_wait + 449 i386 futex_wake sys_futex_wake + 450 i386 futex_waitv sys_futex_waitv compat_sys_futex_waitv ++451 i386 futex_requeue sys_futex_requeue compat_sys_futex_requeue +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index bad4aca3e9ba..a1a39ed156e8 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -371,6 +371,7 @@ + 448 common futex_wait sys_futex_wait + 449 common futex_wake sys_futex_wake + 450 common futex_waitv sys_futex_waitv ++451 common futex_requeue sys_futex_requeue + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/include/linux/compat.h b/include/linux/compat.h +index 06a40776d8a5..34ad63bac18d 100644 +--- a/include/linux/compat.h ++++ b/include/linux/compat.h +@@ -374,6 +374,11 @@ struct compat_futex_waitv { + compat_uint_t flags; + }; + ++struct compat_futex_requeue { ++ compat_uptr_t uaddr; ++ compat_uint_t flags; ++}; ++ + #ifdef CONFIG_COMPAT_OLD_SIGACTION + struct compat_old_sigaction { + compat_uptr_t sa_handler; +@@ -703,6 +708,13 @@ asmlinkage long compat_sys_futex_waitv(struct compat_futex_waitv *waiters, + compat_uint_t nr_futexes, compat_uint_t flags, + struct __kernel_timespec __user *timo); + ++asmlinkage long compat_sys_futex_requeue(struct compat_futex_requeue *uaddr1, ++ struct compat_futex_requeue *uaddr2, ++ compat_uint_t nr_wake, ++ compat_uint_t nr_requeue, ++ compat_uint_t cmpval, ++ compat_uint_t flags); ++ + /* kernel/itimer.c */ + asmlinkage long compat_sys_getitimer(int which, + struct old_itimerval32 __user *it); +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 7d166f7304ae..aca64b5126a7 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -72,6 +72,7 @@ struct mount_attr; + struct landlock_ruleset_attr; + enum landlock_rule_type; + struct futex_waitv; ++struct futex_requeue; + + #include + #include +@@ -633,6 +634,10 @@ asmlinkage long sys_futex_wake(void __user *uaddr, unsigned int nr_wake, + asmlinkage long sys_futex_waitv(struct futex_waitv __user *waiters, + unsigned int nr_futexes, unsigned int flags, + struct __kernel_timespec __user *timo); ++asmlinkage long sys_futex_requeue(struct futex_requeue __user *uaddr1, ++ struct futex_requeue __user *uaddr2, ++ unsigned int nr_wake, unsigned int nr_requeue, ++ unsigned int cmpval, unsigned int flags); + + /* kernel/hrtimer.c */ + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 1179d3f02d65..78d30c06b217 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -882,8 +882,11 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake) + #define __NR_futex_waitv 445 + __SC_COMP(__NR_futex_waitv, sys_futex_waitv, compat_sys_futex_waitv) + ++#define __NR_futex_requeue 446 ++__SC_COMP(__NR_futex_requeue, sys_futex_requeue, compat_sys_futex_requeue) ++ + #undef __NR_syscalls +-#define __NR_syscalls 451 ++#define __NR_syscalls 452 + + /* + * 32 bit systems traditionally used different +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index 3216aee015d2..c15bfddcf1e2 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -62,6 +62,16 @@ struct futex_waitv { + unsigned int flags; + }; + ++/** ++ * struct futex_requeue - Define an address and its flags for requeue operation ++ * @uaddr: User address of one of the requeue arguments ++ * @flags: Flags for this address ++ */ ++struct futex_requeue { ++ void __user *uaddr; ++ unsigned int flags; ++}; ++ + /* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. +diff --git a/kernel/futex2.c b/kernel/futex2.c +index beb2ce11ac83..0d1db071c363 100644 +--- a/kernel/futex2.c ++++ b/kernel/futex2.c +@@ -975,6 +975,221 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + return ret; + } + ++static void futex_double_unlock(struct futex_bucket *b1, struct futex_bucket *b2) ++{ ++ spin_unlock(&b1->lock); ++ if (b1 != b2) ++ spin_unlock(&b2->lock); ++} ++ ++static inline int __futex_requeue(struct futex_requeue rq1, ++ struct futex_requeue rq2, unsigned int nr_wake, ++ unsigned int nr_requeue, unsigned int cmpval, ++ bool shared1, bool shared2) ++{ ++ struct futex_waiter w1, w2, *aux, *tmp; ++ bool retry = false; ++ struct futex_bucket *b1, *b2; ++ DEFINE_WAKE_Q(wake_q); ++ u32 uval; ++ int ret; ++ ++ b1 = futex_get_bucket(rq1.uaddr, &w1.key, shared1); ++ if (IS_ERR(b1)) ++ return PTR_ERR(b1); ++ ++ b2 = futex_get_bucket(rq2.uaddr, &w2.key, shared2); ++ if (IS_ERR(b2)) ++ return PTR_ERR(b2); ++ ++retry: ++ if (shared1 && retry) { ++ b1 = futex_get_bucket(rq1.uaddr, &w1.key, shared1); ++ if (IS_ERR(b1)) ++ return PTR_ERR(b1); ++ } ++ ++ if (shared2 && retry) { ++ b2 = futex_get_bucket(rq2.uaddr, &w2.key, shared2); ++ if (IS_ERR(b2)) ++ return PTR_ERR(b2); ++ } ++ ++ bucket_inc_waiters(b2); ++ /* ++ * To ensure the locks are taken in the same order for all threads (and ++ * thus avoiding deadlocks), take the "smaller" one first ++ */ ++ if (b1 <= b2) { ++ spin_lock(&b1->lock); ++ if (b1 < b2) ++ spin_lock_nested(&b2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ spin_lock(&b2->lock); ++ spin_lock_nested(&b1->lock, SINGLE_DEPTH_NESTING); ++ } ++ ++ ret = futex_get_user(&uval, rq1.uaddr); ++ ++ if (unlikely(ret)) { ++ futex_double_unlock(b1, b2); ++ if (__get_user(uval, (u32 __user *)rq1.uaddr)) ++ return -EFAULT; ++ ++ bucket_dec_waiters(b2); ++ retry = true; ++ goto retry; ++ } ++ ++ if (uval != cmpval) { ++ futex_double_unlock(b1, b2); ++ ++ bucket_dec_waiters(b2); ++ return -EAGAIN; ++ } ++ ++ list_for_each_entry_safe(aux, tmp, &b1->list, list) { ++ if (futex_match(w1.key, aux->key)) { ++ if (ret < nr_wake) { ++ futex_mark_wake(aux, b1, &wake_q); ++ ret++; ++ continue; ++ } ++ ++ if (ret >= nr_wake + nr_requeue) ++ break; ++ ++ aux->key.pointer = w2.key.pointer; ++ aux->key.index = w2.key.index; ++ aux->key.offset = w2.key.offset; ++ ++ if (b1 != b2) { ++ list_del_init_careful(&aux->list); ++ bucket_dec_waiters(b1); ++ ++ list_add_tail(&aux->list, &b2->list); ++ bucket_inc_waiters(b2); ++ } ++ ret++; ++ } ++ } ++ ++ futex_double_unlock(b1, b2); ++ wake_up_q(&wake_q); ++ bucket_dec_waiters(b2); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_COMPAT ++static int compat_futex_parse_requeue(struct futex_requeue *rq, ++ struct compat_futex_requeue __user *uaddr, ++ bool *shared) ++{ ++ struct compat_futex_requeue tmp; ++ ++ if (copy_from_user(&tmp, uaddr, sizeof(tmp))) ++ return -EFAULT; ++ ++ if (tmp.flags & ~FUTEXV_WAITER_MASK || ++ (tmp.flags & FUTEX_SIZE_MASK) != FUTEX_32) ++ return -EINVAL; ++ ++ *shared = (tmp.flags & FUTEX_SHARED_FLAG) ? true : false; ++ ++ rq->uaddr = compat_ptr(tmp.uaddr); ++ rq->flags = tmp.flags; ++ ++ return 0; ++} ++ ++COMPAT_SYSCALL_DEFINE6(futex_requeue, struct compat_futex_requeue __user *, uaddr1, ++ struct compat_futex_requeue __user *, uaddr2, ++ unsigned int, nr_wake, unsigned int, nr_requeue, ++ unsigned int, cmpval, unsigned int, flags) ++{ ++ struct futex_requeue rq1, rq2; ++ bool shared1, shared2; ++ int ret; ++ ++ if (flags) ++ return -EINVAL; ++ ++ ret = compat_futex_parse_requeue(&rq1, uaddr1, &shared1); ++ if (ret) ++ return ret; ++ ++ ret = compat_futex_parse_requeue(&rq2, uaddr2, &shared2); ++ if (ret) ++ return ret; ++ ++ return __futex_requeue(rq1, rq2, nr_wake, nr_requeue, cmpval, shared1, shared2); ++} ++#endif ++ ++/** ++ * futex_parse_requeue - Copy a user struct futex_requeue and check it's flags ++ * @rq: Kernel struct ++ * @uaddr: Address of user struct ++ * @shared: Out parameter, defines if this is a shared futex ++ * ++ * Return: 0 on success, error code otherwise ++ */ ++static int futex_parse_requeue(struct futex_requeue *rq, ++ struct futex_requeue __user *uaddr, bool *shared) ++{ ++ if (copy_from_user(rq, uaddr, sizeof(*rq))) ++ return -EFAULT; ++ ++ if (rq->flags & ~FUTEXV_WAITER_MASK || ++ (rq->flags & FUTEX_SIZE_MASK) != FUTEX_32) ++ return -EINVAL; ++ ++ *shared = (rq->flags & FUTEX_SHARED_FLAG) ? true : false; ++ ++ return 0; ++} ++ ++/** ++ * sys_futex_requeue - Wake futexes at uaddr1 and requeue from uaddr1 to uaddr2 ++ * @uaddr1: Address of futexes to be waken/dequeued ++ * @uaddr2: Address for the futexes to be enqueued ++ * @nr_wake: Number of futexes waiting in uaddr1 to be woken up ++ * @nr_requeue: Number of futexes to be requeued from uaddr1 to uaddr2 ++ * @cmpval: Expected value at uaddr1 ++ * @flags: Reserved flags arg for requeue operation expansion. Must be 0. ++ * ++ * If (uaddr1->uaddr == cmpval), wake at uaddr1->uaddr a nr_wake number of ++ * waiters and then, remove a number of nr_requeue waiters at uaddr1->uaddr ++ * and add then to uaddr2->uaddr list. Each uaddr has its own set of flags, ++ * that must be defined at struct futex_requeue (such as size, shared, NUMA). ++ * ++ * Return the number of the woken futexes + the number of requeued ones on ++ * success, error code otherwise. ++ */ ++SYSCALL_DEFINE6(futex_requeue, struct futex_requeue __user *, uaddr1, ++ struct futex_requeue __user *, uaddr2, ++ unsigned int, nr_wake, unsigned int, nr_requeue, ++ unsigned int, cmpval, unsigned int, flags) ++{ ++ struct futex_requeue rq1, rq2; ++ bool shared1, shared2; ++ int ret; ++ ++ if (flags) ++ return -EINVAL; ++ ++ ret = futex_parse_requeue(&rq1, uaddr1, &shared1); ++ if (ret) ++ return ret; ++ ++ ret = futex_parse_requeue(&rq2, uaddr2, &shared2); ++ if (ret) ++ return ret; ++ ++ return __futex_requeue(rq1, rq2, nr_wake, nr_requeue, cmpval, shared1, shared2); ++} ++ + static int __init futex2_init(void) + { + int i; +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index d70bb8cb884f..af0b1ef09d93 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -155,6 +155,7 @@ COND_SYSCALL_COMPAT(get_robust_list); + COND_SYSCALL(futex_wait); + COND_SYSCALL(futex_wake); + COND_SYSCALL(futex_waitv); ++COND_SYSCALL(futex_requeue); /* kernel/hrtimer.c */ -- -2.33.0 +2.31.1 -From 7d46d8d918d5ce7b43a4b918f5cf86a27df89bfd Mon Sep 17 00:00:00 2001 + +From 75ed26356ac56c0110ee39243b8c2948751cfd36 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 11 Feb 2021 10:47:23 -0300 +Subject: [PATCH 05/14] futex2: Add compatibility entry point for x86_x32 ABI +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +New syscalls should use the same entry point for x86_64 and x86_x32 +paths. Add a wrapper for x32 calls to use parse functions that assumes +32bit pointers. + +Signed-off-by: André Almeida +--- + kernel/futex2.c | 42 +++++++++++++++++++++++++++++++++++------- + 1 file changed, 35 insertions(+), 7 deletions(-) + +diff --git a/kernel/futex2.c b/kernel/futex2.c +index 0d1db071c363..22ba9b3e45e2 100644 +--- a/kernel/futex2.c ++++ b/kernel/futex2.c +@@ -23,6 +23,10 @@ + #include + #include + ++#ifdef CONFIG_X86_64 ++#include ++#endif ++ + /** + * struct futex_key - Components to build unique key for a futex + * @pointer: Pointer to current->mm or inode's UUID for file backed futexes +@@ -872,7 +876,16 @@ SYSCALL_DEFINE4(futex_waitv, struct futex_waitv __user *, waiters, + futexv->hint = false; + futexv->task = current; + +- ret = futex_parse_waitv(futexv, waiters, nr_futexes); ++#ifdef CONFIG_X86_X32_ABI ++ if (in_x32_syscall()) { ++ ret = compat_futex_parse_waitv(futexv, (struct compat_futex_waitv *)waiters, ++ nr_futexes); ++ } else ++#endif ++ { ++ ret = futex_parse_waitv(futexv, waiters, nr_futexes); ++ } ++ + if (!ret) + ret = futex_set_timer_and_wait(futexv, nr_futexes, timo, flags); + +@@ -1179,13 +1192,28 @@ SYSCALL_DEFINE6(futex_requeue, struct futex_requeue __user *, uaddr1, + if (flags) + return -EINVAL; + +- ret = futex_parse_requeue(&rq1, uaddr1, &shared1); +- if (ret) +- return ret; ++#ifdef CONFIG_X86_X32_ABI ++ if (in_x32_syscall()) { ++ ret = compat_futex_parse_requeue(&rq1, (struct compat_futex_requeue *)uaddr1, ++ &shared1); ++ if (ret) ++ return ret; + +- ret = futex_parse_requeue(&rq2, uaddr2, &shared2); +- if (ret) +- return ret; ++ ret = compat_futex_parse_requeue(&rq2, (struct compat_futex_requeue *)uaddr2, ++ &shared2); ++ if (ret) ++ return ret; ++ } else ++#endif ++ { ++ ret = futex_parse_requeue(&rq1, uaddr1, &shared1); ++ if (ret) ++ return ret; ++ ++ ret = futex_parse_requeue(&rq2, uaddr2, &shared2); ++ if (ret) ++ return ret; ++ } + + return __futex_requeue(rq1, rq2, nr_wake, nr_requeue, cmpval, shared1, shared2); + } +-- +2.31.1 + + +From ccdfc0a01aca5de728da256a2e5dea1d8a2ffc1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Tue, 9 Feb 2021 13:59:00 -0300 -Subject: [PATCH 03/10] docs: locking: futex2: Add documentation +Subject: [PATCH 06/14] docs: locking: futex2: Add documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -1144,26 +2292,26 @@ Content-Transfer-Encoding: 8bit Add a new documentation file specifying both userspace API and internal implementation details of futex2 syscalls. -Signed-off-by: André Almeida +Signed-off-by: André Almeida --- - Documentation/locking/futex2.rst | 185 +++++++++++++++++++++++++++++++ + Documentation/locking/futex2.rst | 198 +++++++++++++++++++++++++++++++ Documentation/locking/index.rst | 1 + - 2 files changed, 186 insertions(+) + 2 files changed, 199 insertions(+) create mode 100644 Documentation/locking/futex2.rst diff --git a/Documentation/locking/futex2.rst b/Documentation/locking/futex2.rst new file mode 100644 -index 000000000000..81b78284d70c +index 000000000000..13a7699bd6fc --- /dev/null +++ b/Documentation/locking/futex2.rst -@@ -0,0 +1,185 @@ +@@ -0,0 +1,198 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +futex2 +====== + -+:Author: André Almeida ++:Author: André Almeida + +futex, or fast user mutex, is a set of syscalls to allow userspace to create +performant synchronization mechanisms, such as mutexes, semaphores and @@ -1188,7 +2336,7 @@ index 000000000000..81b78284d70c +--------------------- + +The flag is used to specify the size of the futex word -+(FUTEX_[8, 16, 32, 64]). It's mandatory to define one, since there's no ++(FUTEX_[8, 16, 32]). It's mandatory to define one, since there's no +default size. + +By default, the timeout uses a monotonic clock, but can be used as a realtime @@ -1203,8 +2351,8 @@ index 000000000000..81b78284d70c +By default, the operation has no NUMA-awareness, meaning that the user can't +choose the memory node where the kernel side futex data will be stored. The +user can choose the node where it wants to operate by setting the -+FUTEX_NUMA_FLAG and using the following structure (where X can be 8, 16, 32 or -+64):: ++FUTEX_NUMA_FLAG and using the following structure (where X can be 8, 16, or ++32):: + + struct futexX_numa { + __uX value; @@ -1235,13 +2383,21 @@ index 000000000000..81b78284d70c +Implementation +============== + -+Kernel side implementation is made on top of current futex codebase. ++The internal implementation follows a similar design to the original futex. ++Given that we want to replicate the same external behavior of current futex, ++this should be somewhat expected. + +Waiting +------- + ++For the wait operations, they are all treated as if you want to wait on N ++futexes, so the path for futex_wait and futex_waitv is the basically the same. ++For both syscalls, the first step is to prepare an internal list for the list ++of futexes to wait for (using struct futexv_head). For futex_wait() calls, this ++list will have a single object. ++ +We have a hash table, where waiters register themselves before sleeping. Then -+the wake function checks this table looking for waiters at uaddr. The hash ++the wake function checks this table looking for waiters at uaddr. The hash +bucket to be used is determined by a struct futex_key, that stores information +to uniquely identify an address from a given process. Given the huge address +space, there'll be hash collisions, so we store information to be later used on @@ -1249,9 +2405,8 @@ index 000000000000..81b78284d70c + +First, for every futex we want to wait on, we check if (``*uaddr == val``). +This check is done holding the bucket lock, so we are correctly serialized with -+any futex_wake() calls. If any waiter fails the check above we return. For -+futex_waitv() calls, we dequeue all futexes queue until this point. The check -+(``*uaddr == val``) can fail for two reasons: ++any futex_wake() calls. If any waiter fails the check above, we dequeue all ++futexes. The check (``*uaddr == val``) can fail for two reasons: + +- The values are different, and we return -EAGAIN. However, if while + dequeueing we found that some futexes were awakened, we prioritize this @@ -1260,20 +2415,23 @@ index 000000000000..81b78284d70c +- When trying to access the user address, we do so with page faults + disabled because we are holding a bucket's spin lock (and can't sleep + while holding a spin lock). If there's an error, it might be a page -+ fault, or an invalid address. We release the lock, dequeue everyone if it's a -+ futex_waitv() call (because it's illegal to sleep while there are futexes -+ enqueued, we could lose wakeups) and try again with page fault enabled. If we -+ succeed, this means that the address is valid, but we need to do all the work -+ again. For serialization reasons, we need to have the spin lock when getting -+ the user value. Additionally, for shared futexes, we also need to recalculate -+ the hash, since the underlying mapping mechanisms could have changed when -+ dealing with page fault. If, even with page fault enabled, we can't access -+ the address, it means it's an invalid user address, and we return -EFAULT. ++ fault, or an invalid address. We release the lock, dequeue everyone ++ (because it's illegal to sleep while there are futexes enqueued, we ++ could lose wakeups) and try again with page fault enabled. If we ++ succeed, this means that the address is valid, but we need to do ++ all the work again. For serialization reasons, we need to have the ++ spin lock when getting the user value. Additionally, for shared ++ futexes, we also need to recalculate the hash, since the underlying ++ mapping mechanisms could have changed when dealing with page fault. ++ If, even with page fault enabled, we can't access the address, it ++ means it's an invalid user address, and we return -EFAULT. For this ++ case, we prioritize the error, even if some futexes were awaken. + +If the check is OK, they are enqueued on a linked list in our bucket, and +proceed to the next one. If all waiters succeed, we put the thread to sleep +until a futex_wake() call, timeout expires or we get a signal. After waking up, -+we dequeue everyone, and check if some futex was awakened. ++we dequeue everyone, and check if some futex was awakened. This dequeue is done ++by iteratively walking at each element of struct futex_head list. + +All enqueuing/dequeuing operations requires to hold the bucket lock, to avoid +racing while modifying the list. @@ -1335,7 +2493,10 @@ index 000000000000..81b78284d70c + page. + +Note that members of futex_key don't have any particular meaning after they -+are part of the struct - they are just bytes to identify a futex. ++are part of the struct - they are just bytes to identify a futex. Given that, ++we don't need to use a particular name or type that matches the original data, ++we only need to care about the bitsize of each component and make both private ++and shared fit in the same memory space. + +Source code documentation +========================= @@ -1355,12 +2516,13 @@ index 7003bd5aeff4..9bf03c7fa1ec 100644 .. only:: subproject and html -- -2.33.0 +2.31.1 -From 77d121d335cbf90530a864126f7b45107642409e Mon Sep 17 00:00:00 2001 + +From 213a8dc8b0266d98f95d7b5d642abbbf9a636d2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Fri, 5 Feb 2021 10:34:01 -0300 -Subject: [PATCH 04/10] selftests: futex2: Add wake/wait test +Subject: [PATCH 07/14] selftests: futex2: Add wake/wait test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -1375,31 +2537,31 @@ from glibc aren't yet able to use 64 bit sized time variables, add a temporary workaround that implements the required types and calls the appropriated syscalls, since futex2 doesn't supports 32 bit sized time. -Signed-off-by: André Almeida +Signed-off-by: André Almeida --- .../selftests/futex/functional/.gitignore | 1 + - .../selftests/futex/functional/Makefile | 4 +- - .../selftests/futex/functional/futex2_wait.c | 195 ++++++++++++++++++ + .../selftests/futex/functional/Makefile | 6 +- + .../selftests/futex/functional/futex2_wait.c | 209 ++++++++++++++++++ .../testing/selftests/futex/functional/run.sh | 3 + .../selftests/futex/include/futex2test.h | 79 +++++++ - 5 files changed, 281 insertions(+), 1 deletion(-) + 5 files changed, 296 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/futex/functional/futex2_wait.c create mode 100644 tools/testing/selftests/futex/include/futex2test.h diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore -index 0e78b49d0f2f..3e2d577c0595 100644 +index 0efcd494daab..d61f1df94360 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore -@@ -8,3 +8,4 @@ futex_wait_uninitialized_heap +@@ -6,3 +6,4 @@ futex_wait_private_mapped_file futex_wait_wouldblock futex_wait futex_requeue +futex2_wait diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile -index bd1fec59e010..e4e4aa2e0368 100644 +index 23207829ec75..9b334f190759 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile -@@ -6,6 +6,7 @@ LDLIBS := -lpthread -lrt +@@ -1,6 +1,7 @@ HEADERS := \ ../include/futextest.h \ @@ -1407,7 +2569,7 @@ index bd1fec59e010..e4e4aa2e0368 100644 ../include/atomic.h \ ../include/logging.h TEST_GEN_FILES := \ -@@ -17,7 +18,8 @@ TEST_GEN_FILES := \ +@@ -14,7 +15,8 @@ TEST_GEN_FILES := \ futex_wait_uninitialized_heap \ futex_wait_private_mapped_file \ futex_wait \ @@ -1419,10 +2581,10 @@ index bd1fec59e010..e4e4aa2e0368 100644 diff --git a/tools/testing/selftests/futex/functional/futex2_wait.c b/tools/testing/selftests/futex/functional/futex2_wait.c new file mode 100644 -index 000000000000..25ac6d0898f5 +index 000000000000..752a26b33bf8 --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex2_wait.c -@@ -0,0 +1,195 @@ +@@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/****************************************************************************** + * @@ -1432,10 +2594,10 @@ index 000000000000..25ac6d0898f5 + * Test wait/wake mechanism of futex2, using 32bit sized futexes. + * + * AUTHOR -+ * André Almeida ++ * André Almeida + * + * HISTORY -+ * 2021-Feb-5: Initial version by André ++ * 2021-Feb-5: Initial version by André + * + *****************************************************************************/ + @@ -1458,8 +2620,7 @@ index 000000000000..25ac6d0898f5 +#define timeout_ns 30000000 +#define WAKE_WAIT_US 10000 +#define SHM_PATH "futex2_shm_file" -+ -+void *futex; ++futex_t *f1; + +void usage(char *prog) +{ @@ -1470,7 +2631,7 @@ index 000000000000..25ac6d0898f5 + VQUIET, VCRITICAL, VINFO); +} + -+static void *waiterfn(void *arg) ++void *waiterfn(void *arg) +{ + struct timespec64 to64; + unsigned int flags = 0; @@ -1489,21 +2650,28 @@ index 000000000000..25ac6d0898f5 + to64.tv_nsec -= 1000000000; + } + -+ if (futex2_wait(futex, 0, FUTEX_32 | flags, &to64)) ++ if (futex2_wait(f1, *f1, FUTEX_32 | flags, &to64)) + printf("waiter failed errno %d\n", errno); + + return NULL; +} + ++void *waitershm(void *arg) ++{ ++ futex2_wait(arg, 0, FUTEX_32 | FUTEX_SHARED_FLAG, NULL); ++ ++ return NULL; ++} ++ +int main(int argc, char *argv[]) +{ -+ unsigned int flags = FUTEX_SHARED_FLAG; -+ int res, ret = RET_PASS, fd, c, shm_id; -+ u_int32_t f_private = 0, *shared_data; + pthread_t waiter; -+ void *shm; ++ unsigned int flags = FUTEX_SHARED_FLAG; ++ int res, ret = RET_PASS; ++ int c; ++ futex_t f_private = 0; + -+ futex = &f_private; ++ f1 = &f_private; + + while ((c = getopt(argc, argv, "cht:v:")) != -1) { + switch (c) { @@ -1524,62 +2692,75 @@ index 000000000000..25ac6d0898f5 + + ksft_print_header(); + ksft_set_plan(3); -+ ksft_print_msg("%s: Test FUTEX2_WAIT\n", basename(argv[0])); ++ ksft_print_msg("%s: Test FUTEX2_WAIT\n", ++ basename(argv[0])); + + /* Testing a private futex */ -+ info("Calling private futex2_wait on futex: %p\n", futex); ++ info("Calling private futex2_wait on f1: %u @ %p with val=%u\n", *f1, f1, *f1); ++ + if (pthread_create(&waiter, NULL, waiterfn, NULL)) + error("pthread_create failed\n", errno); + + usleep(WAKE_WAIT_US); + -+ info("Calling private futex2_wake on futex: %p\n", futex); -+ res = futex2_wake(futex, 1, FUTEX_32); ++ info("Calling private futex2_wake on f1: %u @ %p with val=%u\n", *f1, f1, *f1); ++ res = futex2_wake(f1, 1, FUTEX_32); + if (res != 1) { + ksft_test_result_fail("futex2_wake private returned: %d %s\n", -+ errno, strerror(errno)); ++ res ? errno : res, ++ res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { -+ ksft_test_result_pass("futex2_wake private\n"); ++ ksft_test_result_pass("futex2_wake private succeeds\n"); + } + -+ /* Testing an anon page shared memory */ -+ shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); ++ int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); ++ + if (shm_id < 0) { + perror("shmget"); + exit(1); + } + -+ shared_data = shmat(shm_id, NULL, 0); ++ /* Testing an anon page shared memory */ ++ unsigned int *shared_data = shmat(shm_id, NULL, 0); + + *shared_data = 0; -+ futex = shared_data; ++ f1 = shared_data; ++ ++ info("Calling shared futex2_wait on f1: %u @ %p with val=%u\n", *f1, f1, *f1); + -+ info("Calling (page anon) shared futex2_wait on futex: %p\n", futex); + if (pthread_create(&waiter, NULL, waiterfn, &flags)) + error("pthread_create failed\n", errno); + + usleep(WAKE_WAIT_US); + -+ info("Calling (page anon) shared futex2_wake on futex: %p\n", futex); -+ res = futex2_wake(futex, 1, FUTEX_32 | FUTEX_SHARED_FLAG); ++ info("Calling shared futex2_wake on f1: %u @ %p with val=%u\n", *f1, f1, *f1); ++ res = futex2_wake(f1, 1, FUTEX_32 | FUTEX_SHARED_FLAG); + if (res != 1) { -+ ksft_test_result_fail("futex2_wake shared (page anon) returned: %d %s\n", -+ errno, strerror(errno)); ++ ksft_test_result_fail("futex2_wake shared (shmget) returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { -+ ksft_test_result_pass("futex2_wake shared (page anon)\n"); ++ ksft_test_result_pass("futex2_wake shared (shmget) succeeds\n"); + } + ++ shmdt(shared_data); + + /* Testing a file backed shared memory */ ++ void *shm; ++ int fd, pid; ++ ++ f_private = 0; ++ + fd = open(SHM_PATH, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (fd < 0) { + perror("open"); + exit(1); + } + -+ if (ftruncate(fd, sizeof(f_private))) { ++ res = ftruncate(fd, sizeof(f_private)); ++ if (res) { + perror("ftruncate"); + exit(1); + } @@ -1592,38 +2773,33 @@ index 000000000000..25ac6d0898f5 + + memcpy(shm, &f_private, sizeof(f_private)); + -+ futex = shm; -+ -+ info("Calling shared (file backed) futex2_wait on futex: %p\n", futex); -+ if (pthread_create(&waiter, NULL, waiterfn, &flags)) -+ error("pthread_create failed\n", errno); ++ pthread_create(&waiter, NULL, waitershm, shm); + + usleep(WAKE_WAIT_US); + -+ info("Calling shared (file backed) futex2_wake on futex: %p\n", futex); + res = futex2_wake(shm, 1, FUTEX_32 | FUTEX_SHARED_FLAG); + if (res != 1) { -+ ksft_test_result_fail("futex2_wake shared (file backed) returned: %d %s\n", -+ errno, strerror(errno)); ++ ksft_test_result_fail("futex2_wake shared (mmap) returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { -+ ksft_test_result_pass("futex2_wake shared (file backed)\n"); ++ ksft_test_result_pass("futex2_wake shared (mmap) succeeds\n"); + } + -+ /* Freeing resources */ -+ shmdt(shared_data); + munmap(shm, sizeof(f_private)); ++ + remove(SHM_PATH); + + ksft_print_cnts(); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh -index 11a9d62290f5..dbe82275617c 100755 +index 1acb6ace1680..3730159c865a 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh -@@ -79,3 +79,6 @@ echo - +@@ -73,3 +73,6 @@ echo + echo ./futex_requeue $COLOR + @@ -1631,7 +2807,7 @@ index 11a9d62290f5..dbe82275617c 100755 +./futex2_wait $COLOR diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h new file mode 100644 -index 000000000000..e724d56b917e +index 000000000000..917ac8909a3b --- /dev/null +++ b/tools/testing/selftests/futex/include/futex2test.h @@ -0,0 +1,79 @@ @@ -1644,10 +2820,10 @@ index 000000000000..e724d56b917e + * Futex2 library addons for old futex library + * + * AUTHOR -+ * André Almeida ++ * André Almeida + * + * HISTORY -+ * 2021-Feb-5: Initial version by André ++ * 2021-Feb-5: Initial version by André + * + *****************************************************************************/ +#include "futextest.h" @@ -1715,7 +2891,7 @@ index 000000000000..e724d56b917e + return syscall(__NR_futex_wake, uaddr, nr, flags); +} -- -2.33.0 +2.31.1 From ccc384997de6ef7014440514b7e7acfe5c35202c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= @@ -1739,7 +2915,7 @@ index 1f8f6daaf1e7..d20f54745c2e 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -17,6 +17,14 @@ - + #include #include "futextest.h" + @@ -1751,7 +2927,7 @@ index 1f8f6daaf1e7..d20f54745c2e 100644 +#include +#include "futex2test.h" #include "logging.h" - + #define TEST_NAME "futex-wait-timeout" @@ -92,8 +100,8 @@ static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, int main(int argc, char *argv[]) @@ -1762,10 +2938,10 @@ index 1f8f6daaf1e7..d20f54745c2e 100644 - struct timespec to; pthread_t thread; int c; - + @@ -118,7 +126,7 @@ int main(int argc, char *argv[]) } - + ksft_print_header(); - ksft_set_plan(7); + ksft_set_plan(9); @@ -1775,7 +2951,7 @@ index 1f8f6daaf1e7..d20f54745c2e 100644 @@ -175,6 +183,18 @@ int main(int argc, char *argv[]) res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME); test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS); - + + /* setting absolute monotonic timeout for futex2 */ + if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) + return RET_FAIL; @@ -1791,13 +2967,13 @@ index 1f8f6daaf1e7..d20f54745c2e 100644 ksft_print_cnts(); return ret; } --- +-- 2.33.0 -From 3a43b70d8b29c8b6842ac99001dee733751240f7 Mon Sep 17 00:00:00 2001 +From ffc9b6260a0a8f12da9aa20f3c0a91bf90e732aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Fri, 5 Feb 2021 10:34:01 -0300 -Subject: [PATCH 06/10] selftests: futex2: Add wouldblock test +Subject: [PATCH 09/14] selftests: futex2: Add wouldblock test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -1805,20 +2981,20 @@ Content-Transfer-Encoding: 8bit Adapt existing futex wait wouldblock file to test the same mechanism for futex2. -Signed-off-by: André Almeida +Signed-off-by: André Almeida --- .../futex/functional/futex_wait_wouldblock.c | 33 ++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -index 0ae390ff8164..510a98320248 100644 +index 0ae390ff8164..b1d463ebb33d 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -12,6 +12,7 @@ * * HISTORY * 2009-Nov-14: Initial version by Gowrishankar -+ * 2021-Feb-5: Add futex2 test by André ++ * 2021-Feb-5: Add futex2 test by André * *****************************************************************************/ @@ -1856,7 +3032,7 @@ index 0ae390ff8164..510a98320248 100644 res ? errno : res, res ? strerror(errno) : ""); ret = RET_FAIL; + } else { -+ ksft_test_result_pass("futex_wait wouldblock\n"); ++ ksft_test_result_pass("futex_wait wouldblock succeeds\n"); } - print_result(TEST_NAME, ret); @@ -1878,19 +3054,20 @@ index 0ae390ff8164..510a98320248 100644 + res ? errno : res, res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { -+ ksft_test_result_pass("futex2_wait wouldblock\n"); ++ ksft_test_result_pass("futex2_wait wouldblock succeeds\n"); + } + + ksft_print_cnts(); return ret; } -- -2.33.0 +2.31.1 -From 0df47d473c5ead4d65cbae25a4139f075e452c02 Mon Sep 17 00:00:00 2001 + +From 1b9fd688507408bd196b03ec96b6d5d303ed344b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Fri, 5 Feb 2021 10:34:02 -0300 -Subject: [PATCH 07/10] selftests: futex2: Add waitv test +Subject: [PATCH 10/14] selftests: futex2: Add waitv test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -1899,30 +3076,30 @@ Create a new file to test the waitv mechanism. Test both private and shared futexes. Wake the last futex in the array, and check if the return value from futex_waitv() is the right index. -Signed-off-by: André Almeida +Signed-off-by: André Almeida --- .../selftests/futex/functional/.gitignore | 1 + .../selftests/futex/functional/Makefile | 3 +- - .../selftests/futex/functional/futex2_waitv.c | 154 ++++++++++++++++++ + .../selftests/futex/functional/futex2_waitv.c | 157 ++++++++++++++++++ .../testing/selftests/futex/functional/run.sh | 3 + - .../selftests/futex/include/futex2test.h | 17 ++ - 5 files changed, 177 insertions(+), 1 deletion(-) + .../selftests/futex/include/futex2test.h | 26 +++ + 5 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/futex/functional/futex2_waitv.c diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore -index 3e2d577c0595..22c572de8d10 100644 +index d61f1df94360..d0b8f637b786 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore -@@ -9,3 +9,4 @@ futex_wait_wouldblock - futex_wait - futex_requeue +@@ -7,3 +7,4 @@ futex_wait_timeout + futex_wait_uninitialized_heap + futex_wait_wouldblock futex2_wait +futex2_waitv diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile -index e4e4aa2e0368..240b53d8cb07 100644 +index 9b334f190759..09c08ccdeaf2 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile -@@ -19,7 +19,8 @@ TEST_GEN_FILES := \ +@@ -16,7 +16,8 @@ TEST_GEN_FILES := \ futex_wait_private_mapped_file \ futex_wait \ futex_requeue \ @@ -1934,10 +3111,10 @@ index e4e4aa2e0368..240b53d8cb07 100644 diff --git a/tools/testing/selftests/futex/functional/futex2_waitv.c b/tools/testing/selftests/futex/functional/futex2_waitv.c new file mode 100644 -index 000000000000..0f625a0657d5 +index 000000000000..8ba74f1cbd51 --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex2_waitv.c -@@ -0,0 +1,154 @@ +@@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/****************************************************************************** + * @@ -1947,10 +3124,10 @@ index 000000000000..0f625a0657d5 + * Test waitv/wake mechanism of futex2, using 32bit sized futexes. + * + * AUTHOR -+ * André Almeida ++ * André Almeida + * + * HISTORY -+ * 2021-Feb-5: Initial version by André ++ * 2021-Feb-5: Initial version by André + * + *****************************************************************************/ + @@ -1967,6 +3144,7 @@ index 000000000000..0f625a0657d5 +#include "logging.h" + +#define TEST_NAME "futex2-wait" ++#define timeout_ns 1000000000 +#define WAKE_WAIT_US 10000 +#define NR_FUTEXES 30 +struct futex_waitv waitv[NR_FUTEXES]; @@ -1994,11 +3172,13 @@ index 000000000000..0f625a0657d5 + + res = futex2_waitv(waitv, NR_FUTEXES, 0, &to64); + if (res < 0) { -+ ksft_test_result_fail("futex2_waitv returned: %d %s\n", -+ errno, strerror(errno)); ++ ksft_test_result_fail("futex2_waitv private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); + } else if (res != NR_FUTEXES - 1) { -+ ksft_test_result_fail("futex2_waitv returned: %d, expecting %d\n", -+ res, NR_FUTEXES - 1); ++ ksft_test_result_fail("futex2_waitv private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); + } + + return NULL; @@ -2051,7 +3231,7 @@ index 000000000000..0f625a0657d5 + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { -+ ksft_test_result_pass("futex2_waitv private\n"); ++ ksft_test_result_pass("futex2_waitv private succeeds\n"); + } + + /* Shared waitv */ @@ -2083,7 +3263,7 @@ index 000000000000..0f625a0657d5 + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { -+ ksft_test_result_pass("futex2_waitv shared\n"); ++ ksft_test_result_pass("futex2_waitv shared succeeds\n"); + } + + for (i = 0; i < NR_FUTEXES; i++) @@ -2093,10 +3273,10 @@ index 000000000000..0f625a0657d5 + return ret; +} diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh -index dbe82275617c..6d30a30547da 100755 +index 3730159c865a..18b3883d7236 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh -@@ -82,3 +82,6 @@ echo +@@ -76,3 +76,6 @@ echo echo ./futex2_wait $COLOR @@ -2104,21 +3284,30 @@ index dbe82275617c..6d30a30547da 100755 +echo +./futex2_waitv $COLOR diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h -index e724d56b917e..0ed3b20935be 100644 +index 917ac8909a3b..7f847bd60594 100644 --- a/tools/testing/selftests/futex/include/futex2test.h +++ b/tools/testing/selftests/futex/include/futex2test.h -@@ -28,6 +28,10 @@ +@@ -28,6 +28,19 @@ # define FUTEX_32 2 #endif +#ifndef FUTEX_SHARED_FLAG +#define FUTEX_SHARED_FLAG 8 +#endif ++ ++#ifndef FUTEX_WAITV_MAX ++#define FUTEX_WAITV_MAX 128 ++struct futex_waitv { ++ void *uaddr; ++ unsigned int val; ++ unsigned int flags; ++}; ++#endif + /* * - Y2038 section for 32-bit applications - * -@@ -77,3 +81,16 @@ static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned lo +@@ -77,3 +90,16 @@ static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned lo { return syscall(__NR_futex_wake, uaddr, nr, flags); } @@ -2136,12 +3325,256 @@ index e724d56b917e..0ed3b20935be 100644 + return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo); +} -- -2.33.0 +2.31.1 -From 42718e2912cdb805020a6c0dc97c52e8e6ba4525 Mon Sep 17 00:00:00 2001 + +From 232e77c996fb8a19ef4511771568019d3545156f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Tue, 29 Jun 2021 16:17:42 -0300 -Subject: [PATCH 08/10] perf bench: Add futex2 benchmark tests +Date: Fri, 5 Feb 2021 10:34:02 -0300 +Subject: [PATCH 11/14] selftests: futex2: Add requeue test +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add testing for futex_requeue(). The first test just requeue from one +waiter to another one, and wake it. The second performs both wake and +requeue, and we check return values to see if the operation +woke/requeued the expected number of waiters. + +Signed-off-by: André Almeida +--- + .../selftests/futex/functional/.gitignore | 1 + + .../selftests/futex/functional/Makefile | 3 +- + .../futex/functional/futex2_requeue.c | 164 ++++++++++++++++++ + .../selftests/futex/include/futex2test.h | 16 ++ + 4 files changed, 183 insertions(+), 1 deletion(-) + create mode 100644 tools/testing/selftests/futex/functional/futex2_requeue.c + +diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore +index d0b8f637b786..af7557e821da 100644 +--- a/tools/testing/selftests/futex/functional/.gitignore ++++ b/tools/testing/selftests/futex/functional/.gitignore +@@ -8,3 +8,4 @@ futex_wait_uninitialized_heap + futex_wait_wouldblock + futex2_wait + futex2_waitv ++futex2_requeue +diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile +index 09c08ccdeaf2..3ccb9ea58ddd 100644 +--- a/tools/testing/selftests/futex/functional/Makefile ++++ b/tools/testing/selftests/futex/functional/Makefile +@@ -17,7 +17,8 @@ TEST_GEN_FILES := \ + futex_wait \ + futex_requeue \ + futex2_wait \ +- futex2_waitv ++ futex2_waitv \ ++ futex2_requeue + + TEST_PROGS := run.sh + +diff --git a/tools/testing/selftests/futex/functional/futex2_requeue.c b/tools/testing/selftests/futex/functional/futex2_requeue.c +new file mode 100644 +index 000000000000..05629c2257d0 +--- /dev/null ++++ b/tools/testing/selftests/futex/functional/futex2_requeue.c +@@ -0,0 +1,164 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/****************************************************************************** ++ * ++ * Copyright Collabora Ltd., 2021 ++ * ++ * DESCRIPTION ++ * Test requeue mechanism of futex2, using 32bit sized futexes. ++ * ++ * AUTHOR ++ * André Almeida ++ * ++ * HISTORY ++ * 2021-Feb-5: Initial version by André ++ * ++ *****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "futex2test.h" ++#include "logging.h" ++ ++#define TEST_NAME "futex2-wait" ++#define timeout_ns 30000000 ++#define WAKE_WAIT_US 10000 ++volatile futex_t *f1; ++ ++void usage(char *prog) ++{ ++ printf("Usage: %s\n", prog); ++ printf(" -c Use color\n"); ++ printf(" -h Display this help message\n"); ++ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", ++ VQUIET, VCRITICAL, VINFO); ++} ++ ++void *waiterfn(void *arg) ++{ ++ struct timespec64 to64; ++ ++ /* setting absolute timeout for futex2 */ ++ if (gettime64(CLOCK_MONOTONIC, &to64)) ++ error("gettime64 failed\n", errno); ++ ++ to64.tv_nsec += timeout_ns; ++ ++ if (to64.tv_nsec >= 1000000000) { ++ to64.tv_sec++; ++ to64.tv_nsec -= 1000000000; ++ } ++ ++ if (futex2_wait(f1, *f1, FUTEX_32, &to64)) ++ printf("waiter failed errno %d\n", errno); ++ ++ return NULL; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ pthread_t waiter[10]; ++ int res, ret = RET_PASS; ++ int c, i; ++ volatile futex_t _f1 = 0; ++ volatile futex_t f2 = 0; ++ struct futex_requeue r1, r2; ++ ++ f1 = &_f1; ++ ++ r1.flags = FUTEX_32; ++ r2.flags = FUTEX_32; ++ ++ r1.uaddr = f1; ++ r2.uaddr = &f2; ++ ++ while ((c = getopt(argc, argv, "cht:v:")) != -1) { ++ switch (c) { ++ case 'c': ++ log_color(1); ++ break; ++ case 'h': ++ usage(basename(argv[0])); ++ exit(0); ++ case 'v': ++ log_verbosity(atoi(optarg)); ++ break; ++ default: ++ usage(basename(argv[0])); ++ exit(1); ++ } ++ } ++ ++ ksft_print_header(); ++ ksft_set_plan(2); ++ ksft_print_msg("%s: Test FUTEX2_REQUEUE\n", ++ basename(argv[0])); ++ ++ /* ++ * Requeue a waiter from f1 to f2, and wake f2. ++ */ ++ if (pthread_create(&waiter[0], NULL, waiterfn, NULL)) ++ error("pthread_create failed\n", errno); ++ ++ usleep(WAKE_WAIT_US); ++ ++ res = futex2_requeue(&r1, &r2, 0, 1, 0, 0); ++ if (res != 1) { ++ ksft_test_result_fail("futex2_requeue private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } ++ ++ ++ info("Calling private futex2_wake on f2: %u @ %p with val=%u\n", f2, &f2, f2); ++ res = futex2_wake(&f2, 1, FUTEX_32); ++ if (res != 1) { ++ ksft_test_result_fail("futex2_requeue private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_requeue simple succeeds\n"); ++ } ++ ++ ++ /* ++ * Create 10 waiters at f1. At futex_requeue, wake 3 and requeue 7. ++ * At futex_wake, wake INT_MAX (should be exaclty 7). ++ */ ++ for (i = 0; i < 10; i++) { ++ if (pthread_create(&waiter[i], NULL, waiterfn, NULL)) ++ error("pthread_create failed\n", errno); ++ } ++ ++ usleep(WAKE_WAIT_US); ++ ++ res = futex2_requeue(&r1, &r2, 3, 7, 0, 0); ++ if (res != 10) { ++ ksft_test_result_fail("futex2_requeue private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } ++ ++ res = futex2_wake(&f2, INT_MAX, FUTEX_32); ++ if (res != 7) { ++ ksft_test_result_fail("futex2_requeue private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_requeue succeeds\n"); ++ } ++ ++ ksft_print_cnts(); ++ return ret; ++} +diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h +index 7f847bd60594..faa4027ce5b1 100644 +--- a/tools/testing/selftests/futex/include/futex2test.h ++++ b/tools/testing/selftests/futex/include/futex2test.h +@@ -103,3 +103,19 @@ static inline int futex2_waitv(volatile struct futex_waitv *waiters, unsigned lo + { + return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo); + } ++ ++/** ++ * futex2_requeue - Wake futexes at uaddr1 and requeue from uaddr1 to uaddr2 ++ * @uaddr1: Original address to wake and requeue from ++ * @uaddr2: Address to requeue to ++ * @nr_wake: Number of futexes to wake at uaddr1 before requeuing ++ * @nr_requeue: Number of futexes to requeue from uaddr1 to uaddr2 ++ * @cmpval: If (uaddr1->uaddr != cmpval), return immediatally ++ * @flgas: Operation flags ++ */ ++static inline int futex2_requeue(struct futex_requeue *uaddr1, struct futex_requeue *uaddr2, ++ unsigned int nr_wake, unsigned int nr_requeue, ++ unsigned int cmpval, unsigned long flags) ++{ ++ return syscall(__NR_futex_requeue, uaddr1, uaddr2, nr_wake, nr_requeue, cmpval, flags); ++} +-- +2.31.1 + + +From 34e8923658222740ed4357544cf38df3ea4a0bf2 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Fri, 5 Feb 2021 10:34:02 -0300 +Subject: [PATCH 12/14] perf bench: Add futex2 benchmark tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -2151,38 +3584,43 @@ futex2 calls. `perf bench` tests can be used not only as a way to measure the performance of implementation, but also as stress testing for the kernel infrastructure. -Signed-off-by: André Almeida +Signed-off-by: André Almeida --- - tools/arch/x86/include/asm/unistd_64.h | 8 +++++ - tools/perf/bench/bench.h | 3 ++ - tools/perf/bench/futex-hash.c | 24 ++++++++++++--- - tools/perf/bench/futex-wake-parallel.c | 41 ++++++++++++++++++++++---- - tools/perf/bench/futex-wake.c | 37 ++++++++++++++++++----- - tools/perf/bench/futex.h | 28 ++++++++++++++++++ - tools/perf/builtin-bench.c | 17 ++++++++--- - 7 files changed, 137 insertions(+), 21 deletions(-) + tools/arch/x86/include/asm/unistd_64.h | 12 ++++++ + tools/perf/bench/bench.h | 4 ++ + tools/perf/bench/futex-hash.c | 24 +++++++++-- + tools/perf/bench/futex-requeue.c | 57 ++++++++++++++++++++------ + tools/perf/bench/futex-wake-parallel.c | 41 +++++++++++++++--- + tools/perf/bench/futex-wake.c | 37 +++++++++++++---- + tools/perf/bench/futex.h | 47 +++++++++++++++++++++ + tools/perf/builtin-bench.c | 18 ++++++-- + 8 files changed, 206 insertions(+), 34 deletions(-) diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/asm/unistd_64.h -index 4205ed4158bf..d056006095b2 100644 +index 4205ed4158bf..b65c51e8d675 100644 --- a/tools/arch/x86/include/asm/unistd_64.h +++ b/tools/arch/x86/include/asm/unistd_64.h -@@ -17,3 +17,11 @@ +@@ -17,3 +17,15 @@ #ifndef __NR_setns #define __NR_setns 308 #endif + +#ifndef __NR_futex_wait -+# define __NR_futex_wait 447 ++# define __NR_futex_wait 443 +#endif + +#ifndef __NR_futex_wake -+# define __NR_futex_wake 448 ++# define __NR_futex_wake 444 ++#endif ++ ++#ifndef __NR_futex_requeue ++# define __NR_futex_requeue 446 +#endif diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h -index eac36afab2b3..f6f881a05509 100644 +index eac36afab2b3..12346844b354 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h -@@ -38,8 +38,11 @@ int bench_mem_memcpy(int argc, const char **argv); +@@ -38,9 +38,13 @@ int bench_mem_memcpy(int argc, const char **argv); int bench_mem_memset(int argc, const char **argv); int bench_mem_find_bit(int argc, const char **argv); int bench_futex_hash(int argc, const char **argv); @@ -2192,8 +3630,10 @@ index eac36afab2b3..f6f881a05509 100644 int bench_futex_wake_parallel(int argc, const char **argv); +int bench_futex2_wake_parallel(int argc, const char **argv); int bench_futex_requeue(int argc, const char **argv); ++int bench_futex2_requeue(int argc, const char **argv); /* pi futexes */ int bench_futex_lock_pi(int argc, const char **argv); + int bench_epoll_wait(int argc, const char **argv); diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index b65373ce5c4f..1068749af40c 100644 --- a/tools/perf/bench/futex-hash.c @@ -2254,6 +3694,136 @@ index b65373ce5c4f..1068749af40c 100644 + futex2 = true; + return __bench_futex_hash(argc, argv); +} +diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c +index 5fa23295ee5f..6cdd649b54f4 100644 +--- a/tools/perf/bench/futex-requeue.c ++++ b/tools/perf/bench/futex-requeue.c +@@ -2,8 +2,8 @@ + /* + * Copyright (C) 2013 Davidlohr Bueso + * +- * futex-requeue: Block a bunch of threads on futex1 and requeue them +- * on futex2, N at a time. ++ * futex-requeue: Block a bunch of threads on addr1 and requeue them ++ * on addr2, N at a time. + * + * This program is particularly useful to measure the latency of nthread + * requeues without waking up any tasks -- thus mimicking a regular futex_wait. +@@ -28,7 +28,10 @@ + #include + #include + +-static u_int32_t futex1 = 0, futex2 = 0; ++static u_int32_t addr1 = 0, addr2 = 0; ++ ++static struct futex_requeue rq1 = { .uaddr = &addr1, .flags = FUTEX_32 }; ++static struct futex_requeue rq2 = { .uaddr = &addr2, .flags = FUTEX_32 }; + + /* + * How many tasks to requeue at a time. +@@ -37,7 +40,7 @@ static u_int32_t futex1 = 0, futex2 = 0; + static unsigned int nrequeue = 1; + + static pthread_t *worker; +-static bool done = false, silent = false, fshared = false; ++static bool done = false, silent = false, fshared = false, futex2 = false; + static pthread_mutex_t thread_lock; + static pthread_cond_t thread_parent, thread_worker; + static struct stats requeuetime_stats, requeued_stats; +@@ -79,7 +82,11 @@ static void *workerfn(void *arg __maybe_unused) + pthread_cond_wait(&thread_worker, &thread_lock); + pthread_mutex_unlock(&thread_lock); + +- futex_wait(&futex1, 0, NULL, futex_flag); ++ if (!futex2) ++ futex_wait(&addr1, 0, NULL, futex_flag); ++ else ++ futex2_wait(&addr1, 0, futex_flag, NULL); ++ + return NULL; + } + +@@ -111,7 +118,7 @@ static void toggle_done(int sig __maybe_unused, + done = true; + } + +-int bench_futex_requeue(int argc, const char **argv) ++static int __bench_futex_requeue(int argc, const char **argv) + { + int ret = 0; + unsigned int i, j; +@@ -139,15 +146,20 @@ int bench_futex_requeue(int argc, const char **argv) + if (!worker) + err(EXIT_FAILURE, "calloc"); + +- if (!fshared) ++ if (futex2) { ++ futex_flag = FUTEX_32 | (fshared * FUTEX_SHARED_FLAG); ++ rq1.flags |= FUTEX_SHARED_FLAG * fshared; ++ rq2.flags |= FUTEX_SHARED_FLAG * fshared; ++ } else if (!fshared) { + futex_flag = FUTEX_PRIVATE_FLAG; ++ } + + if (nrequeue > nthreads) + nrequeue = nthreads; + + printf("Run summary [PID %d]: Requeuing %d threads (from [%s] %p to %p), " + "%d at a time.\n\n", getpid(), nthreads, +- fshared ? "shared":"private", &futex1, &futex2, nrequeue); ++ fshared ? "shared":"private", &addr1, &addr2, nrequeue); + + init_stats(&requeued_stats); + init_stats(&requeuetime_stats); +@@ -176,11 +188,15 @@ int bench_futex_requeue(int argc, const char **argv) + gettimeofday(&start, NULL); + while (nrequeued < nthreads) { + /* +- * Do not wakeup any tasks blocked on futex1, allowing ++ * Do not wakeup any tasks blocked on addr1, allowing + * us to really measure futex_wait functionality. + */ +- nrequeued += futex_cmp_requeue(&futex1, 0, &futex2, 0, +- nrequeue, futex_flag); ++ if (!futex2) ++ nrequeued += futex_cmp_requeue(&addr1, 0, &addr2, ++ 0, nrequeue, futex_flag); ++ else ++ nrequeued += futex2_requeue(&rq1, &rq2, ++ 0, nrequeue, 0, 0); + } + + gettimeofday(&end, NULL); +@@ -194,8 +210,12 @@ int bench_futex_requeue(int argc, const char **argv) + j + 1, nrequeued, nthreads, runtime.tv_usec / (double)USEC_PER_MSEC); + } + +- /* everybody should be blocked on futex2, wake'em up */ +- nrequeued = futex_wake(&futex2, nrequeued, futex_flag); ++ /* everybody should be blocked on addr2, wake'em up */ ++ if (!futex2) ++ nrequeued = futex_wake(&addr2, nrequeued, futex_flag); ++ else ++ nrequeued = futex2_wake(&addr2, nrequeued, futex_flag); ++ + if (nthreads != nrequeued) + warnx("couldn't wakeup all tasks (%d/%d)", nrequeued, nthreads); + +@@ -220,3 +240,14 @@ int bench_futex_requeue(int argc, const char **argv) + usage_with_options(bench_futex_requeue_usage, options); + exit(EXIT_FAILURE); + } ++ ++int bench_futex_requeue(int argc, const char **argv) ++{ ++ return __bench_futex_requeue(argc, argv); ++} ++ ++int bench_futex2_requeue(int argc, const char **argv) ++{ ++ futex2 = true; ++ return __bench_futex_requeue(argc, argv); ++} diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index 6e6f5247e1fe..cac90fc0bfb3 100644 --- a/tools/perf/bench/futex-wake-parallel.c @@ -2428,10 +3998,10 @@ index 6d217868f53c..546d2818eed8 100644 + return __bench_futex_wake(argc, argv); +} diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h -index 31b53cc7d5bc..daae47033584 100644 +index 31b53cc7d5bc..6b2213cf3f64 100644 --- a/tools/perf/bench/futex.h +++ b/tools/perf/bench/futex.h -@@ -86,4 +86,32 @@ futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wak +@@ -86,4 +86,51 @@ futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wak return futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2, val, opflags); } @@ -2462,10 +4032,29 @@ index 31b53cc7d5bc..daae47033584 100644 +static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned long flags) +{ + return syscall(__NR_futex_wake, uaddr, nr, flags); ++} ++ ++/** ++ * futex2_requeue - Requeue waiters from an address to another one ++ * @uaddr1: Address where waiters are currently waiting on ++ * @uaddr2: New address to wait ++ * @nr_wake: Number of waiters at uaddr1 to be wake ++ * @nr_requeue: After waking nr_wake, number of waiters to be requeued ++ * @cmpval: Expected value at uaddr1 ++ * @flags: Operation options ++ * ++ * Return: waked futexes + requeued futexes at uaddr1 ++ */ ++static inline int futex2_requeue(volatile struct futex_requeue *uaddr1, ++ volatile struct futex_requeue *uaddr2, ++ unsigned int nr_wake, unsigned int nr_requeue, ++ unsigned int cmpval, unsigned long flags) ++{ ++ return syscall(__NR_futex_requeue, uaddr1, uaddr2, nr_wake, nr_requeue, cmpval, flags); +} #endif /* _FUTEX_H */ diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c -index 62a7b7420a44..200ecacad841 100644 +index 62a7b7420a44..e41a95ad2db6 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -12,10 +12,11 @@ @@ -2484,7 +4073,7 @@ index 62a7b7420a44..200ecacad841 100644 */ #include #include "builtin.h" -@@ -75,6 +76,13 @@ static struct bench futex_benchmarks[] = { +@@ -75,6 +76,14 @@ static struct bench futex_benchmarks[] = { { NULL, NULL, NULL } }; @@ -2492,13 +4081,14 @@ index 62a7b7420a44..200ecacad841 100644 + { "hash", "Benchmark for futex2 hash table", bench_futex2_hash }, + { "wake", "Benchmark for futex2 wake calls", bench_futex2_wake }, + { "wake-parallel", "Benchmark for parallel futex2 wake calls", bench_futex2_wake_parallel }, ++ { "requeue", "Benchmark for futex2 requeue calls", bench_futex2_requeue }, + { NULL, NULL, NULL } +}; + #ifdef HAVE_EVENTFD_SUPPORT static struct bench epoll_benchmarks[] = { { "wait", "Benchmark epoll concurrent epoll_waits", bench_epoll_wait }, -@@ -105,6 +113,7 @@ static struct collection collections[] = { +@@ -105,6 +114,7 @@ static struct collection collections[] = { { "numa", "NUMA scheduling and MM benchmarks", numa_benchmarks }, #endif {"futex", "Futex stressing benchmarks", futex_benchmarks }, @@ -2507,12 +4097,113 @@ index 62a7b7420a44..200ecacad841 100644 {"epoll", "Epoll stressing benchmarks", epoll_benchmarks }, #endif -- -2.33.0 +2.31.1 -From be703ffe14a9562140272abe2e0fa4abd3e52e0d Mon Sep 17 00:00:00 2001 + +From 04b171b8aae7843cc1cc15d4f41188626382548b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Fri, 5 Feb 2021 10:34:02 -0300 -Subject: [PATCH 09/10] futex2: Add sysfs entry for syscall numbers +Subject: [PATCH 13/14] kernel: Enable waitpid() for futex2 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +To make pthreads works as expected if they are using futex2, wake +clear_child_tid with futex2 as well. This is make applications that uses +waitpid() (and clone(CLONE_CHILD_SETTID)) wake while waiting for the +child to terminate. Given that apps should not mix futex() and futex2(), +any correct app will trigger a harmless noop wakeup on the interface +that it isn't using. + +Signed-off-by: André Almeida +--- + include/linux/syscalls.h | 2 ++ + kernel/fork.c | 2 ++ + kernel/futex2.c | 30 ++++++++++++++++++------------ + 3 files changed, 22 insertions(+), 12 deletions(-) + +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index aca64b5126a7..a0a9748b0236 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -1325,6 +1325,8 @@ int ksys_ipc(unsigned int call, int first, unsigned long second, + unsigned long third, void __user * ptr, long fifth); + int compat_ksys_ipc(u32 call, int first, int second, + u32 third, u32 ptr, u32 fifth); ++long ksys_futex_wake(void __user *uaddr, unsigned long nr_wake, ++ unsigned int flags); + + /* + * The following kernel syscall equivalents are just wrappers to fs-internal +diff --git a/kernel/fork.c b/kernel/fork.c +index dc06afd725cb..344430d882b1 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1322,6 +1322,8 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm) + put_user(0, tsk->clear_child_tid); + do_futex(tsk->clear_child_tid, FUTEX_WAKE, + 1, NULL, NULL, 0, 0); ++ ksys_futex_wake(tsk->clear_child_tid, 1, ++ FUTEX_32 | FUTEX_SHARED_FLAG); + } + tsk->clear_child_tid = NULL; + } +diff --git a/kernel/futex2.c b/kernel/futex2.c +index 22ba9b3e45e2..25f5dea49ad7 100644 +--- a/kernel/futex2.c ++++ b/kernel/futex2.c +@@ -940,18 +940,8 @@ static inline bool futex_match(struct futex_key key1, struct futex_key key2) + key1.offset == key2.offset); + } + +-/** +- * sys_futex_wake - Wake a number of futexes waiting on an address +- * @uaddr: Address of futex to be woken up +- * @nr_wake: Number of futexes waiting in uaddr to be woken up +- * @flags: Flags for size and shared +- * +- * Wake `nr_wake` threads waiting at uaddr. +- * +- * Returns the number of woken threads on success, error code otherwise. +- */ +-SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, +- unsigned int, flags) ++long ksys_futex_wake(void __user *uaddr, unsigned long nr_wake, ++ unsigned int flags) + { + bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false; + unsigned int size = flags & FUTEX_SIZE_MASK; +@@ -988,6 +978,22 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + return ret; + } + ++/** ++ * sys_futex_wake - Wake a number of futexes waiting on an address ++ * @uaddr: Address of futex to be woken up ++ * @nr_wake: Number of futexes waiting in uaddr to be woken up ++ * @flags: Flags for size and shared ++ * ++ * Wake `nr_wake` threads waiting at uaddr. ++ * ++ * Returns the number of woken threads on success, error code otherwise. ++ */ ++SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, ++ unsigned int, flags) ++{ ++ return ksys_futex_wake(uaddr, nr_wake, flags); ++} ++ + static void futex_double_unlock(struct futex_bucket *b1, struct futex_bucket *b2) + { + spin_unlock(&b1->lock); +-- +2.31.1 + + +From 015b8cacf01907cdedfb46522908c3a8ab482bd6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Fri, 5 Feb 2021 10:34:02 -0300 +Subject: [PATCH 14/14] futex2: Add sysfs entry for syscall numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -2523,20 +4214,19 @@ process. Expose futex2 syscall number via sysfs so tools that are experimenting with futex2 (like Proton/Wine) can test it and set the syscall number at runtime, rather than setting it at compilation time. -Signed-off-by: André Almeida +Signed-off-by: André Almeida --- kernel/futex2.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/kernel/futex2.c b/kernel/futex2.c -index e5afb5faf98d..3a26f80fd95e 100644 +index 25f5dea49ad7..a7f132bb061d 100644 --- a/kernel/futex2.c +++ b/kernel/futex2.c -@@ -427,3 +427,45 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, - - return futex_wake(uaddr, futex_flags, nr_wake, FUTEX_BITSET_MATCH_ANY); +@@ -1224,6 +1224,48 @@ SYSCALL_DEFINE6(futex_requeue, struct futex_requeue __user *, uaddr1, + return __futex_requeue(rq1, rq2, nr_wake, nr_requeue, cmpval, shared1, shared2); } -+ + +static ssize_t wait_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ @@ -2578,47 +4268,10 @@ index e5afb5faf98d..3a26f80fd95e 100644 + return sysfs_create_group(kernel_kobj, &futex2_sysfs_attr_group); +} +subsys_initcall(futex2_sysfs_init); ++ + static int __init futex2_init(void) + { + int i; -- -2.33.0 - -From 1d8ed8c38196b0cbed555c1b624d3a0205a59a53 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Fri, 25 Jun 2021 18:52:32 -0300 -Subject: [PATCH 10/10] futex2: proton - ---- - include/linux/compat.h | 2 +- - include/uapi/linux/futex.h | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/include/linux/compat.h b/include/linux/compat.h -index 6e3abdde1c86..bb59a7785919 100644 ---- a/include/linux/compat.h -+++ b/include/linux/compat.h -@@ -369,8 +369,8 @@ struct compat_robust_list_head { - }; - - struct compat_futex_waitv { -- compat_u64 val; - compat_uptr_t uaddr; -+ compat_uint_t val; - compat_uint_t flags; - }; - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index daa135bdedda..45691d51cc61 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -55,8 +55,8 @@ - * @flags: Flags for this waiter - */ - struct futex_waitv { -- __u64 val; - void __user *uaddr; -+ unsigned int val; - unsigned int flags; - }; - --- -2.33.0 +2.31.1