Re: [PATCH v2 01/21] scatterlist: Introduce sg_map helper functions

From: Logan Gunthorpe
Date: Thu Apr 27 2017 - 16:14:45 EST



On 26/04/17 01:44 AM, Christoph Hellwig wrote:
> I think we'll at least need a draft of those to make sense of these
> patches. Otherwise they just look very clumsy.

Ok, what follows is a draft patch attempting to show where I'm thinking
of going with this. Obviously it will not compile because it assumes
the users throughout the kernel are a bit different than they are today.
Notably, there is no sg_page anymore.

There's also likely a ton of issues and arguments to have over a bunch
of the specifics below and I'd expect the concept to evolve more
as cleanup occurs. This itself is an evolution of the draft I posted
replying to you in my last RFC thread.

Also, before any of this is truly useful to us, pfn_t would have to
infect a few other places in the kernel.

Thanks,

Logan


diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index fad170b..85ef928 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -6,13 +6,14 @@
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/pfn_t.h>
#include <asm/io.h>

struct scatterlist {
#ifdef CONFIG_DEBUG_SG
unsigned long sg_magic;
#endif
- unsigned long page_link;
+ pfn_t pfn;
unsigned int offset;
unsigned int length;
dma_addr_t dma_address;
@@ -60,15 +61,68 @@ struct sg_table {

#define SG_MAGIC 0x87654321

-/*
- * We overload the LSB of the page pointer to indicate whether it's
- * a valid sg entry, or whether it points to the start of a new
scatterlist.
- * Those low bits are there for everyone! (thanks mason :-)
- */
-#define sg_is_chain(sg) ((sg)->page_link & 0x01)
-#define sg_is_last(sg) ((sg)->page_link & 0x02)
-#define sg_chain_ptr(sg) \
- ((struct scatterlist *) ((sg)->page_link & ~0x03))
+static inline bool sg_is_chain(struct scatterlist *sg)
+{
+ return sg->pfn.val & PFN_SG_CHAIN;
+}
+
+static inline bool sg_is_last(struct scatterlist *sg)
+{
+ return sg->pfn.val & PFN_SG_LAST;
+}
+
+static inline struct scatterlist *sg_chain_ptr(struct scatterlist *sg)
+{
+ unsigned long sgl = pfn_t_to_pfn(sg->pfn);
+ return (struct scatterlist *)(sgl << PAGE_SHIFT);
+}
+
+static inline bool sg_is_iomem(struct scatterlist *sg)
+{
+ return pfn_t_is_iomem(sg->pfn);
+}
+
+/**
+ * sg_assign_pfn - Assign a given pfn_t to an SG entry
+ * @sg: SG entry
+ * @pfn: The pfn
+ *
+ * Description:
+ * Assign a pfn to sg entry. Also see sg_set_pfn(), the most commonly
used
+ * variant.w
+ *
+ **/
+static inline void sg_assign_pfn(struct scatterlist *sg, pfn_t pfn)
+{
+#ifdef CONFIG_DEBUG_SG
+ BUG_ON(sg->sg_magic != SG_MAGIC);
+ BUG_ON(sg_is_chain(sg));
+ BUG_ON(pfn.val & (PFN_SG_CHAIN | PFN_SG_LAST));
+#endif
+
+ sg->pfn = pfn;
+}
+
+/**
+ * sg_set_pfn - Set sg entry to point at given pfn
+ * @sg: SG entry
+ * @pfn: The page
+ * @len: Length of data
+ * @offset: Offset into page
+ *
+ * Description:
+ * Use this function to set an sg entry pointing at a pfn, never assign
+ * the page directly. We encode sg table information in the lower bits
+ * of the page pointer. See sg_pfn_t for looking up the pfn_t belonging
+ * to an sg entry.
+ **/
+static inline void sg_set_pfn(struct scatterlist *sg, pfn_t pfn,
+ unsigned int len, unsigned int offset)
+{
+ sg_assign_pfn(sg, pfn);
+ sg->offset = offset;
+ sg->length = len;
+}

/**
* sg_assign_page - Assign a given page to an SG entry
@@ -82,18 +136,13 @@ struct sg_table {
**/
static inline void sg_assign_page(struct scatterlist *sg, struct page
*page)
{
- unsigned long page_link = sg->page_link & 0x3;
+ if (!page) {
+ pfn_t null_pfn = {0};
+ sg_assign_pfn(sg, null_pfn);
+ return;
+ }

- /*
- * In order for the low bit stealing approach to work, pages
- * must be aligned at a 32-bit boundary as a minimum.
- */
- BUG_ON((unsigned long) page & 0x03);
-#ifdef CONFIG_DEBUG_SG
- BUG_ON(sg->sg_magic != SG_MAGIC);
- BUG_ON(sg_is_chain(sg));
-#endif
- sg->page_link = page_link | (unsigned long) page;
+ sg_assign_pfn(sg, page_to_pfn_t(page));
}

/**
@@ -106,8 +155,7 @@ static inline void sg_assign_page(struct scatterlist
*sg, struct page *page)
* Description:
* Use this function to set an sg entry pointing at a page, never assign
* the page directly. We encode sg table information in the lower bits
- * of the page pointer. See sg_page() for looking up the page belonging
- * to an sg entry.
+ * of the page pointer.
*
**/
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
@@ -118,13 +166,53 @@ static inline void sg_set_page(struct scatterlist
*sg, struct page *page,
sg->length = len;
}

-static inline struct page *sg_page(struct scatterlist *sg)
+/**
+ * sg_pfn_t - Return the pfn_t for the sg
+ * @sg: SG entry
+ *
+ **/
+static inline pfn_t sg_pfn_t(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
BUG_ON(sg_is_chain(sg));
#endif
- return (struct page *)((sg)->page_link & ~0x3);
+
+ return sg->pfn;
+}
+
+/**
+ * sg_to_mappable_page - Try to return a struct page safe for general
+ * use in the kernel
+ * @sg: SG entry
+ * @page: A pointer to the returned page
+ *
+ * Description:
+ * If possible, return a mappable page that's safe for use around the
+ * kernel. Should only be used in legacy situations. sg_pfn_t() is a
+ * better choice for new code. This is deliberately more awkward than
+ * the old sg_page to enforce the __must_check rule and discourage future
+ * use.
+ *
+ * An example where this is required is in nvme-fabrics: a page from an
+ * sgl is placed into a bio. This function would be required until we can
+ * convert bios to use pfn_t as well. Similar issues with skbs, etc.
+ **/
+static inline __must_check int sg_to_mappable_page(struct scatterlist *sg,
+ struct page **ret)
+{
+ struct page *pg;
+
+ if (unlikely(sg_is_iomem(sg)))
+ return -EFAULT;
+
+ pg = pfn_t_to_page(sg->pfn);
+ if (unlikely(!pg))
+ return -EFAULT;
+
+ *ret = pg;
+
+ return 0;
}

#define SG_KMAP (1 << 0) /* create a mapping with kmap */
@@ -167,8 +255,19 @@ static inline void *sg_map(struct scatterlist *sg,
size_t offset, int flags)
unsigned int pg_off;
void *ret;

+ if (unlikely(sg_is_iomem(sg))) {
+ ret = ERR_PTR(-EFAULT);
+ goto out;
+ }
+
+ pg = pfn_t_to_page(sg->pfn);
+ if (unlikely(!pg)) {
+ ret = ERR_PTR(-EFAULT);
+ goto out;
+ }
+
offset += sg->offset;
- pg = nth_page(sg_page(sg), offset >> PAGE_SHIFT);
+ pg = nth_page(pg, offset >> PAGE_SHIFT);
pg_off = offset_in_page(offset);

if (flags & SG_KMAP_ATOMIC)
@@ -178,12 +277,7 @@ static inline void *sg_map(struct scatterlist *sg,
size_t offset, int flags)
else
ret = ERR_PTR(-EINVAL);

- /*
- * In theory, this can't happen yet. Once we start adding
- * unmapable memory, it also shouldn't happen unless developers
- * start putting unmappable struct pages in sgls and passing
- * it to code that doesn't support it.
- */
+out:
BUG_ON(flags & SG_MAP_MUST_NOT_FAIL && IS_ERR(ret));

return ret;
@@ -202,9 +296,15 @@ static inline void *sg_map(struct scatterlist *sg,
size_t offset, int flags)
static inline void sg_unmap(struct scatterlist *sg, void *addr,
size_t offset, int flags)
{
- struct page *pg = nth_page(sg_page(sg), offset >> PAGE_SHIFT);
+ struct page *pg;
unsigned int pg_off = offset_in_page(offset);

+ pg = pfn_t_to_page(sg->pfn);
+ if (unlikely(!pg))
+ return;
+
+ pg = nth_page(pg, offset >> PAGE_SHIFT);
+
if (flags & SG_KMAP_ATOMIC)
kunmap_atomic(addr - sg->offset - pg_off);
else if (flags & SG_KMAP)
@@ -246,17 +346,18 @@ static inline void sg_set_buf(struct scatterlist
*sg, const void *buf,
static inline void sg_chain(struct scatterlist *prv, unsigned int
prv_nents,
struct scatterlist *sgl)
{
+ pfn_t pfn;
+ unsigned long _sgl = (unsigned long) sgl;
+
/*
* offset and length are unused for chain entry. Clear them.
*/
prv[prv_nents - 1].offset = 0;
prv[prv_nents - 1].length = 0;

- /*
- * Set lowest bit to indicate a link pointer, and make sure to clear
- * the termination bit if it happens to be set.
- */
- prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02;
+ BUG_ON(_sgl & PAGE_MASK);
+ pfn = __pfn_to_pfn_t(_sgl >> PAGE_SHIFT, PFN_SG_CHAIN);
+ prv[prv_nents - 1].pfn = pfn;
}

/**
@@ -276,8 +377,8 @@ static inline void sg_mark_end(struct scatterlist *sg)
/*
* Set termination bit, clear potential chain bit
*/
- sg->page_link |= 0x02;
- sg->page_link &= ~0x01;
+ sg->pfn.val |= PFN_SG_LAST;
+ sg->pfn.val &= ~PFN_SG_CHAIN;
}

/**
@@ -293,7 +394,7 @@ static inline void sg_unmark_end(struct scatterlist *sg)
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
- sg->page_link &= ~0x02;
+ sg->pfn.val &= ~PFN_SG_LAST;
}

/**
@@ -301,14 +402,13 @@ static inline void sg_unmark_end(struct
scatterlist *sg)
* @sg: SG entry
*
* Description:
- * This calls page_to_phys() on the page in this sg entry, and adds the
- * sg offset. The caller must know that it is legal to call
page_to_phys()
- * on the sg page.
+ * This calls pfn_t_to_phys() on the pfn in this sg entry, and adds the
+ * sg offset.
*
**/
static inline dma_addr_t sg_phys(struct scatterlist *sg)
{
- return page_to_phys(sg_page(sg)) + sg->offset;
+ return pfn_t_to_phys(sg->pfn) + sg->offset;
}

/**
@@ -323,7 +423,12 @@ static inline dma_addr_t sg_phys(struct scatterlist
*sg)
**/
static inline void *sg_virt(struct scatterlist *sg)
{
- return page_address(sg_page(sg)) + sg->offset;
+ struct page *pg = pfn_t_to_page(sg->pfn);
+
+ BUG_ON(sg_is_iomem(sg));
+ BUG_ON(!pg);
+
+ return page_address(pg) + sg->offset;
}

int sg_nents(struct scatterlist *sg);
@@ -422,10 +527,18 @@ void __sg_page_iter_start(struct sg_page_iter *piter,
/**
* sg_page_iter_page - get the current page held by the page iterator
* @piter: page iterator holding the page
+ *
+ * This function will require some cleanup. Some users simply mark
+ * attributes of the pages which are fine, others actually map it and
+ * will require some saftey there.
*/
static inline struct page *sg_page_iter_page(struct sg_page_iter *piter)
{
- return nth_page(sg_page(piter->sg), piter->sg_pgoffset);
+ struct page *pg = pfn_t_to_page(piter->sg->pfn);
+ if (!pg)
+ return NULL;
+
+ return nth_page(pg, piter->sg_pgoffset);
}

/**
@@ -468,11 +581,13 @@ static inline dma_addr_t
sg_page_iter_dma_address(struct sg_page_iter *piter)
#define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */
#define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap */
#define SG_MITER_FROM_SG (1 << 2) /* nop */
+#define SG_MITER_SUPPORTS_IOMEM (1 << 3) /* iteratee supports
iomem */

struct sg_mapping_iter {
/* the following three fields can be accessed directly */
struct page *page; /* currently mapped page */
void *addr; /* pointer to the mapped area */
+ void __iomem *ioaddr; /* pointer to iomem */
size_t length; /* length of the mapped area */
size_t consumed; /* number of consumed bytes */
struct sg_page_iter piter; /* page iterator */
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index c6cf822..2d1c58c 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -571,6 +571,8 @@ EXPORT_SYMBOL(sg_miter_skip);
*/
bool sg_miter_next(struct sg_mapping_iter *miter)
{
+ void *addr;
+
sg_miter_stop(miter);

/*
@@ -580,13 +582,25 @@ bool sg_miter_next(struct sg_mapping_iter *miter)
if (!sg_miter_get_next_page(miter))
return false;

+ if (sg_is_iomem(miter->piter.sg) &&
+ !(miter->__flags & SG_MITER_SUPPORTS_IOMEM))
+ return false;
+
miter->page = sg_page_iter_page(&miter->piter);
miter->consumed = miter->length = miter->__remaining;

if (miter->__flags & SG_MITER_ATOMIC)
- miter->addr = kmap_atomic(miter->page) + miter->__offset;
+ addr = kmap_atomic(miter->page) + miter->__offset;
else
- miter->addr = kmap(miter->page) + miter->__offset;
+ addr = kmap(miter->page) + miter->__offset;
+
+ if (sg_is_iomem(miter->piter.sg)) {
+ miter->addr = NULL;
+ miter->ioaddr = (void * __iomem) addr;
+ } else {
+ miter->addr = addr;
+ miter->ioaddr = NULL;
+ }

return true;
}
@@ -651,7 +665,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl,
unsigned int nents, void *buf,
{
unsigned int offset = 0;
struct sg_mapping_iter miter;
- unsigned int sg_flags = SG_MITER_ATOMIC;
+ unsigned int sg_flags = SG_MITER_ATOMIC | SG_MITER_SUPPORTS_IOMEM;

if (to_buffer)
sg_flags |= SG_MITER_FROM_SG;
@@ -668,10 +682,17 @@ size_t sg_copy_buffer(struct scatterlist *sgl,
unsigned int nents, void *buf,

len = min(miter.length, buflen - offset);

- if (to_buffer)
- memcpy(buf + offset, miter.addr, len);
- else
- memcpy(miter.addr, buf + offset, len);
+ if (miter.addr) {
+ if (to_buffer)
+ memcpy(buf + offset, miter.addr, len);
+ else
+ memcpy(miter.addr, buf + offset, len);
+ } else if (miter.ioaddr) {
+ if (to_buffer)
+ memcpy_fromio(buf + offset, miter.addr, len);
+ else
+ memcpy_toio(miter.addr, buf + offset, len);
+ }

offset += len;
}