Re: [PATCH v2 4/4] arch, x86: cache management apis for persistent memory

From: Boaz Harrosh
Date: Mon Jun 01 2015 - 07:39:52 EST


On 05/30/2015 09:59 PM, Dan Williams wrote:
> From: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
>
> Based on an original patch by Ross Zwisler [1].
>
> Writes to persistent memory have the potential to be posted to cpu
> cache, cpu write buffers, and platform write buffers (memory controller)
> before being committed to persistent media. Provide apis
> (persistent_copy() and persistent_sync()) to copy data and assert that
> it is durable in PMEM (a persistent linear address range).
>
> [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html
>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
> Signed-off-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
> [djbw: s/arch_persistent_flush()/io_flush_cache_range()/]
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> ---
> arch/x86/Kconfig | 1
> arch/x86/include/asm/cacheflush.h | 24 ++++++++++
> arch/x86/include/asm/io.h | 6 ++
> drivers/block/pmem.c | 58 ++++++++++++++++++++++-
> include/linux/pmem.h | 93 +++++++++++++++++++++++++++++++++++++
> lib/Kconfig | 3 +
> 6 files changed, 183 insertions(+), 2 deletions(-)
> create mode 100644 include/linux/pmem.h
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 73a4d0330ad0..6412d92e6f1e 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -102,6 +102,7 @@ config X86
> select HAVE_ARCH_TRANSPARENT_HUGEPAGE
> select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
> select ARCH_HAS_SG_CHAIN
> + select ARCH_HAS_PMEM_API
> select CLKEVT_I8253
> select ARCH_HAVE_NMI_SAFE_CMPXCHG
> select GENERIC_IOMAP
> diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
> index b6f7457d12e4..6b8bd5c43bf6 100644
> --- a/arch/x86/include/asm/cacheflush.h
> +++ b/arch/x86/include/asm/cacheflush.h
> @@ -4,6 +4,7 @@
> /* Caches aren't brain-dead on the intel. */
> #include <asm-generic/cacheflush.h>
> #include <asm/special_insns.h>
> +#include <asm/uaccess.h>
>
> /*
> * The set_memory_* API can be used to change various attributes of a virtual
> @@ -108,4 +109,27 @@ static inline int rodata_test(void)
> }
> #endif
>
> +static inline void arch_persistent_copy(void *dst, const void *src, size_t n)
> +{
> + /*
> + * We are copying between two kernel buffers, if
> + * __copy_from_user_inatomic_nocache() returns an error (page
> + * fault) we would have already taken an unhandled fault before
> + * the BUG_ON. The BUG_ON is simply here to satisfy
> + * __must_check and allow reuse of the common non-temporal store
> + * implementation for persistent_copy().
> + */
> + BUG_ON(__copy_from_user_inatomic_nocache(dst, src, n));
> +}
> +
> +static inline void arch_persistent_sync(void)
> +{
> + wmb();
> + pcommit_sfence();
> +}
> +
> +static inline bool __arch_has_persistent_sync(void)
> +{
> + return boot_cpu_has(X86_FEATURE_PCOMMIT);
> +}
> #endif /* _ASM_X86_CACHEFLUSH_H */
> diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
> index 956f2768bdc1..f3c32bb207cf 100644
> --- a/arch/x86/include/asm/io.h
> +++ b/arch/x86/include/asm/io.h
> @@ -250,6 +250,12 @@ static inline void flush_write_buffers(void)
> #endif
> }
>
> +static inline void *arch_persistent_remap(resource_size_t offset,
> + unsigned long size)
> +{
> + return (void __force *) ioremap_cache(offset, size);
> +}
> +
> #endif /* __KERNEL__ */
>
> extern void native_io_delay(void);
> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
> index 799acff6bd7c..10cbe557165c 100644
> --- a/drivers/block/pmem.c
> +++ b/drivers/block/pmem.c
> @@ -23,9 +23,16 @@
> #include <linux/module.h>
> #include <linux/moduleparam.h>
> #include <linux/slab.h>
> +#include <linux/pmem.h>
>
> #define PMEM_MINORS 16
>
> +struct pmem_ops {
> + void *(*remap)(resource_size_t offset, unsigned long size);
> + void (*copy)(void *dst, const void *src, size_t size);
> + void (*sync)(void);

What? why the ops at all see below ...

> +};
> +
> struct pmem_device {
> struct request_queue *pmem_queue;
> struct gendisk *pmem_disk;
> @@ -34,11 +41,54 @@ struct pmem_device {
> phys_addr_t phys_addr;
> void *virt_addr;
> size_t size;
> + struct pmem_ops ops;
> };
>
> static int pmem_major;
> static atomic_t pmem_index;
>
> +static void default_pmem_sync(void)
> +{
> + wmb();
> +}
> +
> +static void default_pmem_copy(void *dst, const void *src, size_t size)
> +{
> + memcpy(dst, src, size);
> +}
> +
> +static void pmem_ops_default_init(struct pmem_device *pmem)
> +{
> + /*
> + * These defaults seek to offer decent performance and minimize
> + * the window between i/o completion and writes being durable on
> + * media. However, it is undefined / architecture specific
> + * whether acknowledged data may be lost in transit if a power
> + * fail occurs after bio_endio().
> + */
> + pmem->ops.remap = memremap_wt;
> + pmem->ops.copy = default_pmem_copy;
> + pmem->ops.sync = default_pmem_sync;
> +}
> +
> +static bool pmem_ops_init(struct pmem_device *pmem)
> +{
> + if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) &&
> + arch_has_persistent_sync()) {

I must be slow and stupid but it looks to me like:

if arch_has_persistent_sync == false then persistent_sync
can then just be the default above wmb, and
if (something_always_off())
do
Will be always faster then a function pointer. This
if can be in the generic implementation of persistent_sync
and be done with.

Then persistent_copy() can just have an inline generic
implementation of your memcpy above, and the WARN_ON_ONCE
or what ever you want.

And no need for any opt vector and function pointers call out.

And also for me the all arch_has_persistent_sync() is mute,
the arches that have members of its family not support some
fixture can do the if (always_false_or_true) thing and do
not pollute the global name space. All we need is a single
switch as above
#ifdef CONFIG_ARCH_HAS_PMEM_API
=> arch_persistent_sync
#else
=> wmb
#endif

> + /*
> + * This arch + cpu guarantees that bio_endio() == data
> + * durable on media.
> + */
> + pmem->ops.remap = persistent_remap;
> + pmem->ops.copy = persistent_copy;
> + pmem->ops.sync = persistent_sync;
> + return true;
> + }
> +
> + pmem_ops_default_init(pmem);
> + return false;
> +}
> +
> static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
> unsigned int len, unsigned int off, int rw,
> sector_t sector)
> @@ -51,7 +101,7 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
> flush_dcache_page(page);
> } else {
> flush_dcache_page(page);
> - memcpy(pmem->virt_addr + pmem_off, mem + off, len);
> + pmem->ops.copy(pmem->virt_addr + pmem_off, mem + off, len);
> }
>
> kunmap_atomic(mem);
> @@ -82,6 +132,8 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio)
> sector += bvec.bv_len >> 9;
> }
>
> + if (rw)
> + pmem->ops.sync();
> out:
> bio_endio(bio, err);
> }
> @@ -131,6 +183,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
>
> pmem->phys_addr = res->start;
> pmem->size = resource_size(res);
> + if (!pmem_ops_init(pmem))

fine just #ifndef CONFIG_ARCH_HAS_PMEM_API

> + dev_warn(dev, "unable to guarantee persistence of writes\n");

But I wouldn't even bother I'd just put a WARN_ON_ONCE inside
persistent_copy() on any real use pmem or not and be done with it.

>
> err = -EINVAL;
> if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
> @@ -143,7 +197,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
> * of the CPU caches in case of a crash.
> */
> err = -ENOMEM;
> - pmem->virt_addr = memremap_wt(pmem->phys_addr, pmem->size);
> + pmem->virt_addr = pmem->ops.remap(pmem->phys_addr, pmem->size);
> if (!pmem->virt_addr)
> goto out_release_region;
>
> diff --git a/include/linux/pmem.h b/include/linux/pmem.h
> new file mode 100644
> index 000000000000..e9a63ee1d361
> --- /dev/null
> +++ b/include/linux/pmem.h
> @@ -0,0 +1,93 @@
> +/*
> + * Copyright(c) 2015 Intel Corporation. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of version 2 of the GNU General Public License as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + */
> +#ifndef __PMEM_H__
> +#define __PMEM_H__
> +
> +#include <linux/io.h>
> +#include <asm/cacheflush.h>
> +
> +/*
> + * Architectures that define ARCH_HAS_PMEM_API must provide
> + * implementations for arch_persistent_remap(), arch_persistent_copy(),
> + * arch_persistent_sync(), and __arch_has_persistent_sync().
> + */
> +
> +#ifdef CONFIG_ARCH_HAS_PMEM_API
> +/**
> + * persistent_remap - map physical persistent memory for pmem api
> + * @offset: physical address of persistent memory
> + * @size: size of the mapping
> + *
> + * Establish a mapping of the architecture specific memory type expected
> + * by persistent_copy() and persistent_sync(). For example, it may be
> + * the case that an uncacheable or writethrough mapping is sufficient,
> + * or a writeback mapping provided persistent_copy() and
> + * persistent_sync() arrange for the data to be written through the
> + * cache to persistent media.
> + */
> +static inline void *persistent_remap(resource_size_t offset, unsigned long size)
> +{
> + return arch_persistent_remap(offset, size);
> +}
> +
> +/**
> + * persistent_copy - copy data to persistent memory
> + * @dst: destination buffer for the copy
> + * @src: source buffer for the copy
> + * @n: length of the copy in bytes
> + *
> + * Perform a memory copy that results in the destination of the copy
> + * being effectively evicted from, or never written to, the processor
> + * cache hierarchy after the copy completes. After persistent_copy()
> + * data may still reside in cpu or platform buffers, so this operation
> + * must be followed by a persistent_sync().
> + */
> +static inline void persistent_copy(void *dst, const void *src, size_t n)
> +{
> + arch_persistent_copy(dst, src, n);
> +}
> +
> +/**
> + * persistent_sync - synchronize writes to persistent memory
> + *
> + * After a series of persistent_copy() operations this drains data from
> + * cpu write buffers and any platform (memory controller) buffers to
> + * ensure that written data is durable on persistent memory media.
> + */
> +static inline void persistent_sync(void)
> +{
> + arch_persistent_sync();
> +}
> +
> +/**
> + * arch_has_persistent_sync - true if persistent_sync() ensures durability
> + *
> + * For a given cpu implementation within an architecture it is possible
> + * that persistent_sync() resolves to a nop. In the case this returns
> + * false, pmem api users are unable to ensure durabilty and may want to
> + * fall back to a different data consistency model, or otherwise notify
> + * the user.
> + */
> +static inline bool arch_has_persistent_sync(void)

Again this can just go inside the arch in question.
Those arches without a choice need not bother

> +{
> + return __arch_has_persistent_sync();
> +}
> +#else
> +/* undefined symbols */
> +extern void *persistent_remap(resource_size_t offet, unsigned long size);
> +extern void persistent_copy(void *dst, const void *src, size_t n);
> +extern void persistent_sync(void);

Define these to the generic imp you have in pmem.c (memcpy && wmb)
After all drivers/block/pmem.c is just as generic as this here place

> +extern bool arch_has_persistent_sync(void);
> +#endif /* CONFIG_ARCH_HAS_PMEM_API */
> +
> +#endif /* __PMEM_H__ */
> diff --git a/lib/Kconfig b/lib/Kconfig
> index 601965a948e8..e6a3c892d514 100644
> --- a/lib/Kconfig
> +++ b/lib/Kconfig
> @@ -522,4 +522,7 @@ source "lib/fonts/Kconfig"
> config ARCH_HAS_SG_CHAIN
> def_bool n
>
> +config ARCH_HAS_PMEM_API
> + def_bool n
> +
> endmenu
>

Cheers
Boaz


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/