Re: [Intel-gfx] [PATCH 3/3] Introduce & use new lightweight SGL iterators

From: Chris Wilson
Date: Thu May 19 2016 - 13:28:05 EST


On Tue, May 17, 2016 at 01:05:48PM +0100, Dave Gordon wrote:
> On 17/05/16 11:34, Tvrtko Ursulin wrote:
> >
> >On 16/05/16 16:19, Dave Gordon wrote:
> >>The existing for_each_sg_page() iterator is somewhat heavyweight, and is
> >>limiting i915 driver performance in a few benchmarks. So here we
> >>introduce somewhat lighter weight iterators, primarily for use with GEM
> >>objects or other cases where we need only deal with whole aligned pages.
> >
> >Interesting idea, if for nothing else, then for eliminating the dreaded
> >st->nents of for_each_sg_page. :)
> >
> >Do you know which benchmarks it improves, and by how much?
>
> I know nothing :)
>
> But last time I posted some easy-to-use iterators, Chris Wilson said
> they didn't address his complaint, which was that the existing ones
> were too slow.

These aren't very good either... Compared to the sg iters I have:

gem:exec:fault:1MiB: -4.32%
gem:exec:fault:1MiB:forked: -5.66%
gem:exec:fault:16MiB: -13.33%
gem:exec:fault:16MiB:forked: -12.03%
gem:exec:fault:256MiB: -15.28%
gem:exec:fault:256MiB:forked: -16.98%

(I was really hoping to be able to drop a patch!)

Patch used for reference:

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 03b7c2e..d7c1431 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3787,4 +3787,56 @@ int remap_io_mapping(struct vm_area_struct *vma,
#define i915_gem_object_for_each_vma(vma, obj) \
list_for_each_entry_check(vma, &(obj)->vma_list, obj_link, &(obj)->base.dev->struct_mutex)

+struct sgt_iter { /* cursor used by for_each_sgt_dma() / for_each_sgt_page() */
+	struct scatterlist *sgp; /* entry being walked; NULL when there is none */
+	union {
+		unsigned long pfn; /* page walk: page_to_pfn(sg_page(sgp)) */
+		dma_addr_t dma; /* DMA walk: sg_dma_address(sgp); dma_addr_t, not long, as bus addresses can be 64-bit on 32-bit kernels */
+	} ix;
+	unsigned int curr; /* running byte offset in the entry, starts at sgp->offset */
+	unsigned int max; /* sgp->offset + sgp->length, i.e. end of the entry */
+};
+
+static inline struct sgt_iter
+__sgt_iter(struct scatterlist *sgl, bool dma)
+{
+	struct sgt_iter it = { .sgp = sgl }; /* every other field starts at 0 */
+
+	if (!sgl) /* no entry: hand back the all-zero iterator */
+		return it;
+
+	it.curr = sgl->offset; /* first byte covered by this entry */
+	it.max = it.curr + sgl->length; /* one past the last byte */
+	if (dma)
+		it.ix.dma = sg_dma_address(sgl);
+	else
+		it.ix.pfn = page_to_pfn(sg_page(sgl));
+	return it;
+}
+
+/**
+ * for_each_sgt_dma - iterate over the DMA addresses of the given sg_table
+ * @__dmap: DMA address (output), one per PAGE_SIZE step of each entry; a value of 0 ends the loop
+ * @__iter: 'struct sgt_iter' (iterator state, internal); expanded several times, so pass a plain lvalue
+ * @__sgt: sg_table to iterate over (input); dereferenced unconditionally via ->sgl
+ */
+#define for_each_sgt_dma(__dmap, __iter, __sgt) \
+ for ((__iter) = __sgt_iter((__sgt)->sgl, true); \
+ ((__dmap) = (__iter).ix.dma + (__iter).curr); \
+ (((__iter).curr += PAGE_SIZE) < (__iter).max) || \
+ ((__iter) = __sgt_iter(sg_next((__iter).sgp), true), 0))
+
+/**
+ * for_each_sgt_page - iterate over the pages of the given sg_table
+ * @__pp: page pointer (output), one per PAGE_SIZE step of each entry; set to NULL, ending the loop, when the iterator's pfn is 0
+ * @__iter: 'struct sgt_iter' (iterator state, internal); expanded several times, so pass a plain lvalue
+ * @__sgt: sg_table to iterate over (input); dereferenced unconditionally via ->sgl
+ */
+#define for_each_sgt_page(__pp, __iter, __sgt) \
+ for ((__iter) = __sgt_iter((__sgt)->sgl, false); \
+ ((__pp) = (__iter).ix.pfn == 0 ? NULL : \
+ pfn_to_page((__iter).ix.pfn + ((__iter).curr >> PAGE_SHIFT)));\
+ (((__iter).curr += PAGE_SIZE) < (__iter).max) || \
+ ((__iter) = __sgt_iter(sg_next((__iter).sgp), false), 0))
+
#endif
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 603895a..3fcb540 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1571,18 +1571,19 @@ static void gen6_ppgtt_insert_entries(struct i915_address_space *vm,
unsigned act_pt = first_entry / GEN6_PTES;
unsigned act_pte = first_entry % GEN6_PTES - 1;
u32 pte_encode = vm->pte_encode(0, cache_level, true, flags);
- struct st_iter iter;
+ struct sgt_iter iter;
gen6_pte_t *pt_vaddr;
+ dma_addr_t addr;

pt_vaddr = kmap_px(ppgtt, &ppgtt->pd.page_table[act_pt]);
- st_for_each_address(&iter, pages) {
+ for_each_sgt_dma(addr, iter, pages) {
if (++act_pte == GEN6_PTES) {
kunmap_px(pt_vaddr);
pt_vaddr = kmap_px(ppgtt,
&ppgtt->pd.page_table[++act_pt]);
act_pte = 0;
}
- pt_vaddr[act_pte] = pte_encode | GEN6_PTE_ADDR_ENCODE(iter.dma);
+ pt_vaddr[act_pte] = pte_encode | GEN6_PTE_ADDR_ENCODE(addr);
}
kunmap_px(pt_vaddr);
}

--
Chris Wilson, Intel Open Source Technology Centre