Re: [PATCH v2 4/4] arch, x86: cache management apis for persistent memory

From: Dan Williams
Date: Mon Jun 01 2015 - 12:22:37 EST


On Mon, Jun 1, 2015 at 4:39 AM, Boaz Harrosh <boaz@xxxxxxxxxxxxx> wrote:
> On 05/30/2015 09:59 PM, Dan Williams wrote:
>> From: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
>>
>> Based on an original patch by Ross Zwisler [1].
>>
>> Writes to persistent memory have the potential to be posted to cpu
>> cache, cpu write buffers, and platform write buffers (memory controller)
>> before being committed to persistent media. Provide apis
>> (persistent_copy() and persistent_sync()) to copy data and assert that
>> it is durable in PMEM (a persistent linear address range).
>>
>> [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html
>>
>> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
>> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
>> Signed-off-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
>> [djbw: s/arch_persistent_flush()/io_flush_cache_range()/]
>> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
>> ---
>> arch/x86/Kconfig | 1
>> arch/x86/include/asm/cacheflush.h | 24 ++++++++++
>> arch/x86/include/asm/io.h | 6 ++
>> drivers/block/pmem.c | 58 ++++++++++++++++++++++-
>> include/linux/pmem.h | 93 +++++++++++++++++++++++++++++++++++++
>> lib/Kconfig | 3 +
>> 6 files changed, 183 insertions(+), 2 deletions(-)
>> create mode 100644 include/linux/pmem.h
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index 73a4d0330ad0..6412d92e6f1e 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -102,6 +102,7 @@ config X86
>> select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>> select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
>> select ARCH_HAS_SG_CHAIN
>> + select ARCH_HAS_PMEM_API
>> select CLKEVT_I8253
>> select ARCH_HAVE_NMI_SAFE_CMPXCHG
>> select GENERIC_IOMAP
>> diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
>> index b6f7457d12e4..6b8bd5c43bf6 100644
>> --- a/arch/x86/include/asm/cacheflush.h
>> +++ b/arch/x86/include/asm/cacheflush.h
>> @@ -4,6 +4,7 @@
>> /* Caches aren't brain-dead on the intel. */
>> #include <asm-generic/cacheflush.h>
>> #include <asm/special_insns.h>
>> +#include <asm/uaccess.h>
>>
>> /*
>> * The set_memory_* API can be used to change various attributes of a virtual
>> @@ -108,4 +109,27 @@ static inline int rodata_test(void)
>> }
>> #endif
>>
>> +static inline void arch_persistent_copy(void *dst, const void *src, size_t n)
>> +{
>> + /*
>> + * We are copying between two kernel buffers, if
>> + * __copy_from_user_inatomic_nocache() returns an error (page
>> + * fault) we would have already taken an unhandled fault before
>> + * the BUG_ON. The BUG_ON is simply here to satisfy
>> + * __must_check and allow reuse of the common non-temporal store
>> + * implementation for persistent_copy().
>> + */
>> + BUG_ON(__copy_from_user_inatomic_nocache(dst, src, n));
>> +}
>> +
>> +static inline void arch_persistent_sync(void)
>> +{
>> + wmb();
>> + pcommit_sfence();
>> +}
>> +
>> +static inline bool __arch_has_persistent_sync(void)
>> +{
>> + return boot_cpu_has(X86_FEATURE_PCOMMIT);
>> +}
>> #endif /* _ASM_X86_CACHEFLUSH_H */
>> diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
>> index 956f2768bdc1..f3c32bb207cf 100644
>> --- a/arch/x86/include/asm/io.h
>> +++ b/arch/x86/include/asm/io.h
>> @@ -250,6 +250,12 @@ static inline void flush_write_buffers(void)
>> #endif
>> }
>>
>> +static inline void *arch_persistent_remap(resource_size_t offset,
>> + unsigned long size)
>> +{
>> + return (void __force *) ioremap_cache(offset, size);
>> +}
>> +
>> #endif /* __KERNEL__ */
>>
>> extern void native_io_delay(void);
>> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
>> index 799acff6bd7c..10cbe557165c 100644
>> --- a/drivers/block/pmem.c
>> +++ b/drivers/block/pmem.c
>> @@ -23,9 +23,16 @@
>> #include <linux/module.h>
>> #include <linux/moduleparam.h>
>> #include <linux/slab.h>
>> +#include <linux/pmem.h>
>>
>> #define PMEM_MINORS 16
>>
>> +struct pmem_ops {
>> + void *(*remap)(resource_size_t offset, unsigned long size);
>> + void (*copy)(void *dst, const void *src, size_t size);
>> + void (*sync)(void);
>
> What? Why the ops at all? See below ...
>
>> +};
>> +
>> struct pmem_device {
>> struct request_queue *pmem_queue;
>> struct gendisk *pmem_disk;
>> @@ -34,11 +41,54 @@ struct pmem_device {
>> phys_addr_t phys_addr;
>> void *virt_addr;
>> size_t size;
>> + struct pmem_ops ops;
>> };
>>
>> static int pmem_major;
>> static atomic_t pmem_index;
>>
>> +static void default_pmem_sync(void)
>> +{
>> + wmb();
>> +}
>> +
>> +static void default_pmem_copy(void *dst, const void *src, size_t size)
>> +{
>> + memcpy(dst, src, size);
>> +}
>> +
>> +static void pmem_ops_default_init(struct pmem_device *pmem)
>> +{
>> + /*
>> + * These defaults seek to offer decent performance and minimize
>> + * the window between i/o completion and writes being durable on
>> + * media. However, it is undefined / architecture specific
>> + * whether acknowledged data may be lost in transit if a power
>> + * fail occurs after bio_endio().
>> + */
>> + pmem->ops.remap = memremap_wt;
>> + pmem->ops.copy = default_pmem_copy;
>> + pmem->ops.sync = default_pmem_sync;
>> +}
>> +
>> +static bool pmem_ops_init(struct pmem_device *pmem)
>> +{
>> + if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) &&
>> + arch_has_persistent_sync()) {
>
> I must be slow and stupid but it looks to me like:
>
> If arch_has_persistent_sync() == false, then persistent_sync()
> can just be the default wmb() above, and an
> if (something_always_off())
> do ...
> will always be faster than a function pointer. That
> if can live in the generic implementation of persistent_sync()
> and be done with.

A few things... that "something_always_off()" check is a cpuid call on
x86. There's no need to keep asking; we can resolve the answer once at
init. If you can get this indirect branch to show up in a meaningful
way on a profile I'd reconsider, but branch target buffers are fairly
good these days. The wmb() is neither necessary nor sufficient on x86,
and it's simply a guess that it helps on other archs.
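
To make that concrete, here is a rough sketch (not part of the patch,
name invented for illustration) of what the suggestion amounts to, i.e.
a feature branch taken on every sync:

static inline void per_io_persistent_sync(void)
{
	if (arch_has_persistent_sync())	/* feature check repeated per i/o */
		arch_persistent_sync();
	else
		wmb();	/* and this alone is not a durability guarantee */
}

...versus the patch, which answers that question exactly once in
pmem_ops_init() and then simply calls pmem->ops.sync() in the i/o path.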

> Then persistent_copy() can just have an inline generic
> implementation of your memcpy above, and the WARN_ON_ONCE
> or whatever you want.

No, persistent_copy() does not have the proper context to tell you
which device is encountering the problem. And again, we'd be taking a
conditional branch on every i/o for a determination that could be made
once at init.
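
A sketch of the context difference (illustration only, reusing lines
from the patch below): at init time the driver holds a struct device
and can say which instance is affected,

	if (!pmem_ops_init(pmem))
		dev_warn(dev, "unable to guarantee persistence of writes\n");

while persistent_copy(dst, src, n) only ever sees two kernel pointers
and a length, so a WARN_ON_ONCE() buried in it cannot name the device
and re-checks a condition that was settled at init anyway.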

> And no need for any opt vector and function pointers call out.

There's a third case to consider: driver-local persistence guarantees.
ADR platforms need not call pcommit_sfence() to guarantee persistence.
In that case I would expect ADR platform-enabling developers to extend
pmem_ops_init() to turn off the warning with ops that are specific to
their platform. We also have directed flush support through registers
that could be turned into yet another set of ops.
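
For illustration, a hypothetical ADR hook could look something like the
sketch below; platform_has_adr() is made up for the example, real
detection would come from platform/firmware enabling:

static void pmem_sync_adr(void)
{
	/*
	 * On an ADR platform the memory controller's write buffers are
	 * flushed by the platform on power loss, so draining cpu write
	 * buffers after the non-temporal copy is enough, no pcommit.
	 */
	wmb();
}

static bool pmem_ops_adr_init(struct pmem_device *pmem)
{
	if (!platform_has_adr())	/* hypothetical detection hook */
		return false;
	pmem->ops.remap = persistent_remap;
	pmem->ops.copy = persistent_copy;	/* still bypass the cpu cache */
	pmem->ops.sync = pmem_sync_adr;
	return true;	/* bio_endio() == data durable on media */
}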

> And also, for me the whole arch_has_persistent_sync() is moot;
> arches where some family members do not support a given
> feature can do the if (always_false_or_true) thing and not
> pollute the global namespace. All we need is a single
> switch, as above:
> #ifdef CONFIG_ARCH_HAS_PMEM_API
> => arch_persistent_sync
> #else
> => wmb

wmb() is not a substitute for pcommit; I think it is dangerous to
imply that you can silently fall back to wmb().
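
To spell that out (sketch only, not proposed code): a fallback that
silently degrades to

static inline void not_really_persistent_sync(void)
{
	wmb();	/* orders stores / drains cpu write buffers ... */
	/*
	 * ... but nothing here flushes the memory controller's write
	 * buffers to the persistence domain; the pcommit + sfence step
	 * from arch_persistent_sync() is simply missing.
	 */
}

would be reporting durability it cannot deliver on a platform without
ADR.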

> #endif
>
>> + /*
>> + * This arch + cpu guarantees that bio_endio() == data
>> + * durable on media.
>> + */
>> + pmem->ops.remap = persistent_remap;
>> + pmem->ops.copy = persistent_copy;
>> + pmem->ops.sync = persistent_sync;
>> + return true;
>> + }
>> +
>> + pmem_ops_default_init(pmem);
>> + return false;
>> +}
>> +
>> static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
>> unsigned int len, unsigned int off, int rw,
>> sector_t sector)
>> @@ -51,7 +101,7 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
>> flush_dcache_page(page);
>> } else {
>> flush_dcache_page(page);
>> - memcpy(pmem->virt_addr + pmem_off, mem + off, len);
>> + pmem->ops.copy(pmem->virt_addr + pmem_off, mem + off, len);
>> }
>>
>> kunmap_atomic(mem);
>> @@ -82,6 +132,8 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio)
>> sector += bvec.bv_len >> 9;
>> }
>>
>> + if (rw)
>> + pmem->ops.sync();
>> out:
>> bio_endio(bio, err);
>> }
>> @@ -131,6 +183,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
>>
>> pmem->phys_addr = res->start;
>> pmem->size = resource_size(res);
>> + if (!pmem_ops_init(pmem))
>
> Fine, just #ifndef CONFIG_ARCH_HAS_PMEM_API

No, because the presence of the API is not sufficient to determine
whether durability can be guaranteed.

>> + dev_warn(dev, "unable to guarantee persistence of writes\n");
>
> But I wouldn't even bother; I'd just put a WARN_ON_ONCE inside
> persistent_copy() on any real use, pmem or not, and be done with it.

Again, we can make this determination once, rather than repeatedly for
every future use.