[PATCH 1/2] dma-mapping: introduce relaxed version of dma sync

From: Cho KyongHo
Date: Tue Aug 18 2020 - 03:51:09 EST


On most CPU architectures, cache maintenance operations need a memory
barrier after the cache maintenance so that the DMAs view the region
of memory correctly. The problem is that a memory barrier is very
expensive, and dma_[un]map_sg() and dma_sync_sg_for_{device|cpu}()
issue a memory barrier for every single sg entry. On some CPU
micro-architectures, a single memory barrier takes more time than a
cache clean of 4KiB. The problem becomes more serious as the number of
CPU cores grows.
This patch introduces arch_sync_dma_for_device_relaxed() and
arch_sync_dma_for_cpu_relaxed(), which do not involve a memory barrier.
Users of those functions must therefore explicitly call
arch_sync_barrier_for_device() or arch_sync_barrier_for_cpu(),
respectively, to ensure that the view of memory is consistent between
the CPUs and the DMAs.

Signed-off-by: Cho KyongHo <pullip.cho@xxxxxxxxxxx>
---
drivers/iommu/dma-iommu.c | 6 +++--
include/linux/dma-direct.h | 29 +++++++++++++++++-----
include/linux/dma-noncoherent.h | 54 +++++++++++++++++++++++++++++++++++++++++
kernel/dma/Kconfig | 8 ++++++
kernel/dma/direct.c | 25 +++++++++++++++----
5 files changed, 109 insertions(+), 13 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 5141d49..4f9c9cb 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -705,7 +705,8 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
return;

for_each_sg(sgl, sg, nelems, i)
- arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
+ arch_sync_dma_for_cpu_relaxed(sg_phys(sg), sg->length, dir);
+ arch_sync_barrier_for_cpu(dir);
}

static void iommu_dma_sync_sg_for_device(struct device *dev,
@@ -719,7 +720,8 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
return;

for_each_sg(sgl, sg, nelems, i)
- arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
+ arch_sync_dma_for_device_relaxed(sg_phys(sg), sg->length, dir);
+ arch_sync_barrier_for_device(dir);
}

static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 6e87225..f5b1fee 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -152,7 +152,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
}

-static inline dma_addr_t dma_direct_map_page(struct device *dev,
+static inline dma_addr_t __dma_direct_map_page(struct device *dev,
struct page *page, unsigned long offset, size_t size,
enum dma_data_direction dir, unsigned long attrs)
{
@@ -172,20 +172,37 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
return DMA_MAPPING_ERROR;
}

- if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
}

-static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+static inline dma_addr_t dma_direct_map_page(struct device *dev,
+ struct page *page, unsigned long offset, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ dma_addr_t dma_addr = __dma_direct_map_page(dev, page, offset, size, dir, attrs);
+
+ if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+ !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ arch_sync_dma_for_device(page_to_phys(page) + offset, size, dir);
+
+ return dma_addr;
+}
+
+static inline void __dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
phys_addr_t phys = dma_to_phys(dev, addr);

+ if (unlikely(is_swiotlb_buffer(phys)))
+ swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+}
+
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);

- if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+ __dma_direct_unmap_page(dev, addr, size, dir, attrs);
}
#endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index ca09a4e..0a31e6c 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -73,23 +73,77 @@ static inline void arch_dma_cache_sync(struct device *dev, void *vaddr,
#endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */

#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED
+void arch_sync_dma_for_device_relaxed(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir);
+
+static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
+{
+ arch_sync_dma_for_device_relaxed(paddr, size, dir);
+ arch_sync_barrier_for_device(dir);
+}
+#else
+#define arch_sync_dma_for_device_relaxed arch_sync_dma_for_device
+
void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
enum dma_data_direction dir);
+
+static inline void arch_sync_barrier_for_device(enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED */
#else
+static inline void arch_sync_dma_for_device_relaxed(phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir)
+{
+}
+
static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
}
+
+static inline void arch_sync_barrier_for_device(enum dma_data_direction dir)
+{
+}
#endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE */

#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED
+void arch_sync_dma_for_cpu_relaxed(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir);
+
+static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
+{
+ arch_sync_dma_for_cpu_relaxed(paddr, size, dir);
+ arch_sync_barrier_for_cpu(dir);
+}
+#else
+#define arch_sync_dma_for_cpu_relaxed arch_sync_dma_for_cpu
+
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
enum dma_data_direction dir);
+
+static inline void arch_sync_barrier_for_cpu(enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED */
#else
+static inline void arch_sync_dma_for_cpu_relaxed(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
+{
+}
+
static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
}
+
+static inline void arch_sync_barrier_for_cpu(enum dma_data_direction dir)
+{
+}
#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */

#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 847a9d1..d6fe727f1 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -59,6 +59,14 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU
bool
select NEED_DMA_MAP_STATE

+config ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED
+ bool
+ select ARCH_HAS_SYNC_DMA_FOR_DEVICE
+
+config ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED
+ bool
+ select ARCH_HAS_SYNC_DMA_FOR_CPU
+
config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
bool

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index db6ef07a..52e5fd1 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -321,9 +321,12 @@ void dma_direct_sync_sg_for_device(struct device *dev,
dir, SYNC_FOR_DEVICE);

if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, sg->length,
+ arch_sync_dma_for_device_relaxed(paddr, sg->length,
dir);
}
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_barrier_for_device(dir);
}
#endif

@@ -340,15 +343,17 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));

if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_for_cpu_relaxed(paddr, sg->length, dir);

if (unlikely(is_swiotlb_buffer(paddr)))
swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
SYNC_FOR_CPU);
}

- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
+ arch_sync_barrier_for_cpu(dir);
arch_sync_dma_for_cpu_all();
+ }
}

void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
@@ -357,8 +362,11 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
struct scatterlist *sg;
int i;

+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir);
+
for_each_sg(sgl, sg, nents, i)
- dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
+ __dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
attrs);
}
#endif
@@ -370,13 +378,20 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct scatterlist *sg;

for_each_sg(sgl, sg, nents, i) {
- sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
+ sg->dma_address = __dma_direct_map_page(dev, sg_page(sg),
sg->offset, sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR)
goto out_unmap;
sg_dma_len(sg) = sg->length;
}

+ if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+ for_each_sg(sgl, sg, nents, i)
+ arch_sync_dma_for_device_relaxed(dma_to_phys(dev, sg_dma_address(sg)),
+ sg->length, dir);
+ arch_sync_barrier_for_device(dir);
+ }
+
return nents;

out_unmap:
--
2.7.4