Re: [PATCH] futex: Support smaller futexes of one byte or two byte size.
From: Peter Zijlstra
Date: Fri Dec 06 2019 - 10:31:43 EST
On Wed, Dec 04, 2019 at 06:52:38PM -0500, Malte Skarupke wrote:
> Two reasons for adding this:
> 1. People often want small mutexes. Having mutexes that are only one byte
> in size was the first reason WebKit mentioned for re-implementing futexes
> in a library:
> https://webkit.org/blog/6161/locking-in-webkit/
They have the best locking namespace _ever_!! Most impressed with them
for getting away with that.
> I had to change where we store two flags that were previously stored in
> the low bits of the address, since the address can no longer be assumed
> to be aligned on four bytes. Luckily the high bits were free as long as
> PAGE_SIZE is never more than 1 gigabyte.
For now it seems unlikely to run with 1G base pages :-)
> The reason for only supporting u8 and u16 is that those were easy to
> support with the existing interface. 64 bit futexes would require bigger
> changes.
Might make sense to enumerate the issues you've found that stand in the
way of 64bit futexes. But yes, I can think of a few.
> @@ -467,7 +469,14 @@ static void drop_futex_key_refs(union futex_key *key)
>
> enum futex_access {
I prefer FUTEX_NONE, to match PROT_NONE.
> FUTEX_READ,
> - FUTEX_WRITE
> + FUTEX_WRITE,
> + /*
> + * for operations that only need the address and don't touch the
> + * memory, like FUTEX_WAKE or FUTEX_REQUEUE. (not FUTEX_CMP_REQUEUE)
> + * this will skip the size checks of the futex, allowing those
> + * operations to be used with futexes of any size.
> + */
> + FUTEX_NO_READ_WRITE
> };
>
> /**
> @@ -520,25 +529,37 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
> * lock_page() might sleep, the caller should not hold a spinlock.
> */
> static int
> -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
> +get_futex_key(u32 __user *uaddr, int flags, union futex_key *key, enum futex_access rw)
> {
> unsigned long address = (unsigned long)uaddr;
> struct mm_struct *mm = current->mm;
> struct page *page, *tail;
> struct address_space *mapping;
> - int err, ro = 0;
> + int err, fshared, ro = 0;
> +
> + fshared = flags & FLAGS_SHARED;
>
> /*
> * The futex address must be "naturally" aligned.
> */
> + if (flags & FLAGS_8_BITS || rw == FUTEX_NO_READ_WRITE) {
> + if (unlikely(!access_ok(uaddr, sizeof(u8))))
> + return -EFAULT;
> + } else if (flags & FLAGS_16_BITS) {
> + if (unlikely((address % sizeof(u16)) != 0))
> + return -EINVAL;
> + if (unlikely(!access_ok(uaddr, sizeof(u16))))
> + return -EFAULT;
> + } else {
> + if (unlikely((address % sizeof(u32)) != 0))
> + return -EINVAL;
> + if (unlikely(!access_ok(uaddr, sizeof(u32))))
> + return -EFAULT;
> + }
That might be better written like:
if (rw != FUTEX_NONE)
size = futex_size(flags);
else
size = 1;
if (unlikely((address % size) != 0))
return -EINVAL;
if (unlikely(!access_ok(uaddr, size)))
return -EFAULT;
> @@ -799,6 +820,48 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
> return ret ? -EFAULT : 0;
> }
>
> +static int
> +get_futex_value_locked_any_size(u32 *dest, u32 __user *from, int flags)
> +{
> + int ret;
> + u8 dest_8_bits;
> + u16 dest_16_bits;
> +
> + pagefault_disable();
> + if (flags & FLAGS_8_BITS) {
> + ret = __get_user(dest_8_bits, (u8 __user *)from);
> + *dest = dest_8_bits;
> + } else if (flags & FLAGS_16_BITS) {
> + ret = __get_user(dest_16_bits, (u16 __user *)from);
> + *dest = dest_16_bits;
> + } else {
> + ret = __get_user(*dest, from);
> + }
> + pagefault_enable();
> +
> + return ret ? -EFAULT : 0;
> +}
> +
> +static int get_futex_value_any_size(u32 *dest, u32 __user *from, int flags)
> +{
> + int ret;
> + u8 uval_8_bits;
> + u16 uval_16_bits;
> +
> + if (flags & FLAGS_8_BITS) {
> + ret = get_user(uval_8_bits, (u8 __user *)from);
> + if (ret == 0)
> + *dest = uval_8_bits;
> + } else if (flags & FLAGS_16_BITS) {
> + ret = get_user(uval_16_bits, (u16 __user *)from);
> + if (ret == 0)
> + *dest = uval_16_bits;
> + } else {
> + ret = get_user(*dest, from);
> + }
> + return ret;
> +}
Perhaps write that like:
static inline int __get_futex_value(u32 *dest, u32 __user *from, int flags)
{
u32 val = 0;
int ret;
switch (futex_size(flags)) {
case 1:
/*
* afaict __get_user() doesn't care about the type of
* the first argument
*/
ret = __get_user(val, (u8 __user *)from);
break;
case 2:
....
case 4:
}
if (!ret)
*dest = val;
return ret;
}
static inline int get_futex_value(...)
{
do uaccess check
return __get_futex_value();
}
static inline int get_futex_value_locked(...)
{
int ret;
pagefault_disable();
ret = __get_futex_value(...);
pagefault_enable();
return ret;
}
> /*
> * PI code:
> + if (op & FUTEX_ALL_SIZE_BITS) {
> + switch (cmd) {
> + case FUTEX_CMP_REQUEUE:
> + /*
> + * for cmp_requeue, only uaddr has to be of the size
> + * passed in the flags. uaddr2 can be of any size
> + */
> + case FUTEX_WAIT:
> + case FUTEX_WAIT_BITSET:
> + switch (op & FUTEX_ALL_SIZE_BITS) {
> + case FUTEX_SIZE_8_BITS:
> + flags |= FLAGS_8_BITS;
> + break;
> + case FUTEX_SIZE_16_BITS:
> + flags |= FLAGS_16_BITS;
> + break;
> + case FUTEX_SIZE_32_BITS:
> + break;
> + default:
> + /*
> + * if both bits are set we could silently treat
> + * that as a 32 bit futex, but we may want to
> + * use this pattern in the future to indicate a
> + * 64 bit futex or an arbitrarily sized futex.
> + * so we don't want code relying on this yet.
> + */
> + return -EINVAL;
> + }
> + break;
> + case FUTEX_WAKE:
> + case FUTEX_REQUEUE:
> + /*
> + * these instructions work with sized mutexes, but you
> + * don't need to pass the size. we could silently
> + * ignore the size argument, but the code won't verify
> + * that the correct size is used, so it's preferable
> + * to make that clear to the caller.
> + *
> + * for requeue the meaning would also be ambiguous: do
> + * both of them have to be the same size or not? they
> + * don't, and that's clearer when you just don't pass
> + * a size argument.
> + */
> + return -EINVAL;
Took me a while to figure out this relies on FUTEX_NONE to avoid the
alignment tests.
> + default:
> + /*
> + * all other cases are not supported yet
> + */
> + return -EINVAL;
> + }
> + }
>
> switch (cmd) {
> case FUTEX_LOCK_PI:
> --
> 2.17.1
>