Re: [PATCH v2] x86, uaccess: introduce copy_from_iter_wt for pmem / writethrough operations

From: Ross Zwisler
Date: Mon May 08 2017 - 16:32:31 EST


On Fri, Apr 28, 2017 at 12:39:12PM -0700, Dan Williams wrote:
> The pmem driver needs to transfer data to a persistent memory destination
> and to be able to rely on the fact that the destination writes
> are not cached. It is sufficient for the writes to be flushed to a
> cpu-store-buffer (non-temporal / "movnt" in x86 terms), as we expect
> userspace to call fsync() to ensure data-writes have reached a
> power-fail-safe zone in the platform. The fsync() triggers a REQ_FUA or
> REQ_FLUSH to the pmem driver which will turn around and fence previous
> writes with an "sfence".
>
> Implement __copy_from_user_inatomic_wt(), memcpy_page_wt(), and
> memcpy_wt(), which guarantee that the destination buffer is not dirty in
> the cpu cache on completion. The new copy_from_iter_wt and its
> sub-routines will be used to replace the "pmem api" (include/linux/pmem.h
> + arch/x86/include/asm/pmem.h). The availability of copy_from_iter_wt()
> and memcpy_wt() is gated by the CONFIG_ARCH_HAS_UACCESS_WT config symbol;
> they fall back to copy_from_iter_nocache() and plain memcpy() otherwise.
>
> This is meant to satisfy the concern from Linus that if a driver wants
> to do something beyond the normal nocache semantics it should be
> something private to that driver [1], and Al's concern that anything
> uaccess related belongs with the rest of the uaccess code [2].
>
> [1]: https://lists.01.org/pipermail/linux-nvdimm/2017-January/008364.html
> [2]: https://lists.01.org/pipermail/linux-nvdimm/2017-April/009942.html
>
> Cc: <x86@xxxxxxxxxx>
> Cc: Jan Kara <jack@xxxxxxx>
> Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
> Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
> Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> ---
<>
> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
> index c5504b9a472e..07ded30c7e89 100644
> --- a/arch/x86/include/asm/uaccess_64.h
> +++ b/arch/x86/include/asm/uaccess_64.h
> @@ -171,6 +171,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne
> extern long __copy_user_nocache(void *dst, const void __user *src,
> 				unsigned size, int zerorest);
>
> +extern long __copy_user_wt(void *dst, const void __user *src, unsigned size);
> +extern void memcpy_page_wt(char *to, struct page *page, size_t offset,
> +		size_t len);
> +
> static inline int
> __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
> 				  unsigned size)
> @@ -179,6 +183,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
> 	return __copy_user_nocache(dst, src, size, 0);
> }
>
> +static inline int
> +__copy_from_user_inatomic_wt(void *dst, const void __user *src, unsigned size)
> +{
> +	kasan_check_write(dst, size);
> +	return __copy_user_wt(dst, src, size);
> +}
> +
> unsigned long
> copy_user_handle_tail(char *to, char *from, unsigned len);
>
> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
> index 3b7c40a2e3e1..0aeff66a022f 100644
> --- a/arch/x86/lib/usercopy_64.c
> +++ b/arch/x86/lib/usercopy_64.c
> @@ -7,6 +7,7 @@
>  */
> #include <linux/export.h>
> #include <linux/uaccess.h>
> +#include <linux/highmem.h>
>
> /*
>  * Zero Userspace
> @@ -73,3 +74,130 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
> 	clac();
> 	return len;
> }
> +
> +#ifdef CONFIG_ARCH_HAS_UACCESS_WT
> +/**
> + * clean_cache_range - write back a cache range with CLWB
> + * @addr: virtual start address
> + * @size: number of bytes to write back
> + *
> + * Write back a cache range using the CLWB (cache line write back)
> + * instruction. Note that @size is internally rounded up to be cache
> + * line size aligned.
> + */
> +static void clean_cache_range(void *addr, size_t size)
> +{
> +	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
> +	unsigned long clflush_mask = x86_clflush_size - 1;
> +	void *vend = addr + size;
> +	void *p;
> +
> +	for (p = (void *)((unsigned long)addr & ~clflush_mask);
> +	     p < vend; p += x86_clflush_size)
> +		clwb(p);
> +}
> +
> +long __copy_user_wt(void *dst, const void __user *src, unsigned size)
> +{
> +	unsigned long flushed, dest = (unsigned long) dst;
> +	long rc = __copy_user_nocache(dst, src, size, 0);
> +
> +	/*
> +	 * __copy_user_nocache() uses non-temporal stores for the bulk
> +	 * of the transfer, but we need to manually flush if the
> +	 * transfer is unaligned. A cached memory copy is used when
> +	 * destination or size is not naturally aligned. That is:
> +	 *   - Require 8-byte alignment when size is 8 bytes or larger.
> +	 *   - Require 4-byte alignment when size is 4 bytes.
> +	 */
> +	if (size < 8) {
> +		if (!IS_ALIGNED(dest, 4) || size != 4)
> +			clean_cache_range(dst, 1);
> +	} else {
> +		if (!IS_ALIGNED(dest, 8)) {
> +			dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
> +			clean_cache_range(dst, 1);
> +		}
> +
> +		flushed = dest - (unsigned long) dst;
> +		if (size > flushed && !IS_ALIGNED(size - flushed, 8))
> +			clean_cache_range(dst + size - 1, 1);
> +	}
> +
> +	return rc;
> +}
> +
> +void memcpy_wt(void *_dst, const void *_src, size_t size)
> +{
> +	unsigned long dest = (unsigned long) _dst;
> +	unsigned long source = (unsigned long) _src;
> +
> +	/* cache copy and flush to align dest */
> +	if (!IS_ALIGNED(dest, 8)) {
> +		unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
> +
> +		memcpy((void *) dest, (void *) source, len);
> +		clean_cache_range((void *) dest, len);
> +		dest += len;
> +		source += len;
> +		size -= len;
> +		if (!size)
> +			return;
> +	}
> +
> +	/* 4x8 movnti loop */
> +	while (size >= 32) {
> +		asm("movq (%0), %%r8\n"
> +		    "movq 8(%0), %%r9\n"
> +		    "movq 16(%0), %%r10\n"
> +		    "movq 24(%0), %%r11\n"
> +		    "movnti %%r8, (%1)\n"
> +		    "movnti %%r9, 8(%1)\n"
> +		    "movnti %%r10, 16(%1)\n"
> +		    "movnti %%r11, 24(%1)\n"
> +		    :: "r" (source), "r" (dest)
> +		    : "memory", "r8", "r9", "r10", "r11");
> +		dest += 32;
> +		source += 32;
> +		size -= 32;
> +	}
> +
> +	/* 1x8 movnti loop */
> +	while (size >= 8) {
> +		asm("movq (%0), %%r8\n"
> +		    "movnti %%r8, (%1)\n"
> +		    :: "r" (source), "r" (dest)
> +		    : "memory", "r8");
> +		dest += 8;
> +		source += 8;
> +		size -= 8;
> +	}
> +
> +	/* 1x4 movnti loop */
> +	while (size >= 4) {
> +		asm("movl (%0), %%r8d\n"
> +		    "movnti %%r8d, (%1)\n"
> +		    :: "r" (source), "r" (dest)
> +		    : "memory", "r8");
> +		dest += 4;
> +		source += 4;
> +		size -= 4;
> +	}
> +
> +	/* cache copy for remaining bytes */
> +	if (size) {
> +		memcpy((void *) dest, (void *) source, size);
> +		clean_cache_range((void *) dest, size);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(memcpy_wt);

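Just to confirm I'm reading the intended flow correctly, here is a minimal
sketch of how I'd expect a consumer like the pmem driver to pair these
helpers with the fence described in the changelog. The example_pmem_*()
names are mine, and I'm assuming copy_from_iter_wt() keeps the same
signature as copy_from_iter_nocache():

/*
 * Illustration only, not part of this patch.  The _wt copy leaves the
 * destination clean in (or out of) the CPU cache; the wmb() below is
 * sfence on x86-64 and fences previously posted movnt stores when the
 * block layer sends REQ_FLUSH / REQ_FUA.
 */
static size_t example_pmem_copy_from_iter(void *pmem_addr, size_t bytes,
		struct iov_iter *i)
{
	return copy_from_iter_wt(pmem_addr, bytes, i);
}

static void example_pmem_handle_flush(void)
{
	wmb();	/* order earlier non-temporal stores */
}

If that matches the intent, then the split between the copy routines and
the flush/fence handling in the driver makes sense to me.
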
I took a pretty hard look at the changes in arch/x86/lib/usercopy_64.c, and
they look correct to me. The inline assembly for the non-temporal copies,
with the loop control kept in C, is IMHO much easier to follow than the
pure assembly of __copy_user_nocache().
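
The part that took the most staring was the unaligned head/tail handling
in __copy_user_wt(). For reference, this is the case split I convinced
myself of, written as a small userspace mirror of just the flush decisions
(CACHELINE and the printouts are purely illustrative):

/*
 * Review aid only: mirrors the clean_cache_range() decisions made in
 * __copy_user_wt().  CACHELINE stands in for
 * boot_cpu_data.x86_clflush_size, and printf() stands in for clwb.
 */
#include <stdio.h>

#define CACHELINE		64UL
#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))

static void wt_flush_plan(unsigned long dst, unsigned long size)
{
	unsigned long flushed, dest = dst;

	printf("dst=%#lx size=%3lu: ", dst, size);
	if (size < 8) {
		/* only an aligned 4-byte store stays non-temporal */
		if (!IS_ALIGNED(dest, 4) || size != 4)
			printf("flush first line (cached short copy)");
	} else {
		if (!IS_ALIGNED(dest, 8)) {
			/* cached head bytes up to the 8-byte boundary */
			printf("flush first line (unaligned head) ");
			dest = ALIGN(dest, CACHELINE);
		}
		flushed = dest - dst;
		if (size > flushed && !IS_ALIGNED(size - flushed, 8))
			/* cached tail bytes after the last full movnti */
			printf("flush last line (unaligned tail)");
	}
	printf("\n");
}

int main(void)
{
	wt_flush_plan(0x1000, 4);	/* aligned 4-byte: no extra flush */
	wt_flush_plan(0x1002, 4);	/* misaligned short copy: flush   */
	wt_flush_plan(0x1000, 64);	/* aligned bulk: no extra flush   */
	wt_flush_plan(0x1003, 61);	/* unaligned head only            */
	wt_flush_plan(0x1000, 70);	/* 6-byte cached tail             */
	return 0;
}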

Reviewed-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>