Re: [RFC 7/8] Enhance ramfs to support higher order pages

From: Christoph Lameter
Date: Fri Apr 20 2007 - 14:01:34 EST


Variable Order Page Cache: mmap_nopage and mmap_populate

Fix up both functions so that they can operate on arbitrary order
pages. However, both functions establish page table entries in
PAGE_SIZE units only, and the offset and page offset passed to them
are always in PAGE_SIZE units. The parameters were therefore renamed
to pgoff_page, which is in PAGE_SIZE units, in contrast to pgoff,
which is in the order prescribed by the address space.
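
A minimal standalone sketch of the index conversion (illustration
only, not part of the patch; it just mirrors the arithmetic used
below, assuming an order-2 address space, i.e. mapping->order == 2):

	#include <stdio.h>

	int main(void)
	{
		unsigned int order = 2;		/* stands in for mapping->order */
		unsigned long pgoff_page = 13;	/* offset in PAGE_SIZE units */

		/* Index of the compound page in the page cache. */
		unsigned long pgoff = pgoff_page >> order;
		/* Index of the PAGE_SIZE piece within that compound page. */
		unsigned long compound_index = pgoff_page % (1 << order);

		/* Prints "pgoff=3 compound_index=1": PAGE_SIZE page 13 is
		 * the second PAGE_SIZE piece of the order-2 page at index 3. */
		printf("pgoff=%lu compound_index=%lu\n", pgoff, compound_index);
		return 0;
	}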

As a result both functions may handle a page struct pointer that
refers to a tail page, i.e. the page to be mapped or the page that
was mapped. However, that page struct cannot be used to take a
refcount or to mark page state. This can only be done on the
head page!
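
To illustrate (sketch only, not part of the patch): a caller that
receives such a possibly-tail page has to split its accesses roughly
as follows, with compound_head() assumed to be available as it is
used in the fremap.c hunk below:

	struct page *head = compound_head(page); /* page may be a tail page */
	pte_t entry;

	get_page(head);				/* refcounting: head page only */
	SetPageReferenced(head);		/* page flags/state: head page only */
	entry = mk_pte(page, vma->vm_page_prot); /* pte: maps the PAGE_SIZE tail page itself */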

install_page() also needs to be fixed up since filemap_populate()
relies on it.

[WARNING: Very early draft. Untested and may not compile.]

---
mm/filemap.c | 38 ++++++++++++++++++++++++++++----------
mm/fremap.c | 17 +++++++++++------
2 files changed, 39 insertions(+), 16 deletions(-)

Index: linux-2.6.21-rc7/mm/filemap.c
===================================================================
--- linux-2.6.21-rc7.orig/mm/filemap.c 2007-04-19 21:26:16.000000000 -0700
+++ linux-2.6.21-rc7/mm/filemap.c 2007-04-19 21:27:55.000000000 -0700
@@ -1318,6 +1318,12 @@ static int fastcall page_cache_read(stru
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
+ *
+ * filemap_nopage returns a pointer to a page that may be a tail page
+ * of a compound page suitable for the VM to map a PAGE_SIZE portion.
+ * However, the VM must update state information in the head page
+ * alone. E.g. taking a refcount on a tail page does not have the
+ * intended effect.
*/
struct page *filemap_nopage(struct vm_area_struct *area,
unsigned long address, int *type)
@@ -1328,13 +1334,15 @@ struct page *filemap_nopage(struct vm_ar
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
struct page *page;
- unsigned long size, pgoff;
+ unsigned long size, pgoff, pgoff_page, compound_index;
int did_readaround = 0, majmin = VM_FAULT_MINOR;

- pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ pgoff_page = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ pgoff = pgoff_page >> mapping->order;
+ compound_index = pgoff_page % (1 << mapping->order);

retry_all:
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = (i_size_read(inode) + page_cache_size(mapping) - 1) >> page_cache_shift(mapping);
if (pgoff >= size)
goto outside_data_content;

@@ -1412,7 +1420,7 @@ success:
mark_page_accessed(page);
if (type)
*type = majmin;
- return page;
+ return page + compound_index;

outside_data_content:
/*
@@ -1637,8 +1645,12 @@ err:
return NULL;
}

+/*
+ * filemap_populate installs page sized ptes in the indicated area.
+ * However, the underlying pages may be of higher order.
+ */
int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
- unsigned long len, pgprot_t prot, unsigned long pgoff,
+ unsigned long len, pgprot_t prot, unsigned long pgoff_page,
int nonblock)
{
struct file *file = vma->vm_file;
@@ -1648,14 +1660,20 @@ int filemap_populate(struct vm_area_stru
struct mm_struct *mm = vma->vm_mm;
struct page *page;
int err;
+ unsigned long pgoff;
+ int compound_index;

if (!nonblock)
force_page_cache_readahead(mapping, vma->vm_file,
- pgoff, len >> PAGE_CACHE_SHIFT);
+ pgoff_page >> mapping->order,
+ len >> page_cache_shift(mapping));

repeat:
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
+ pgoff = pgoff_page >> mapping->order;
+ compound_index = pgoff_page % (1 << mapping->order);
+
+ size = (i_size_read(inode) + page_cache_size(mapping) - 1) >> page_cache_shift(mapping);
+ if (pgoff + (len >> page_cache_shift(mapping)) > size)
return -EINVAL;

page = filemap_getpage(file, pgoff, nonblock);
@@ -1666,7 +1684,7 @@ repeat:
return -ENOMEM;

if (page) {
- err = install_page(mm, vma, addr, page, prot);
+ err = install_page(mm, vma, addr, page + compound_index, prot);
if (err) {
page_cache_release(page);
return err;
@@ -1682,7 +1700,7 @@ repeat:

len -= PAGE_SIZE;
addr += PAGE_SIZE;
- pgoff++;
+ pgoff_page++;
if (len)
goto repeat;

Index: linux-2.6.21-rc7/mm/fremap.c
===================================================================
--- linux-2.6.21-rc7.orig/mm/fremap.c 2007-04-19 21:33:34.000000000 -0700
+++ linux-2.6.21-rc7/mm/fremap.c 2007-04-19 21:37:30.000000000 -0700
@@ -46,7 +46,9 @@ static int zap_pte(struct mm_struct *mm,

/*
* Install a file page to a given virtual memory address, release any
- * previously existing mapping.
+ * previously existing mapping. The page pointer may refer to a tail
+ * page, in which case we update state in the head page but establish
+ * a PAGE_SIZE mapping to the tail page alone.
*/
int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, struct page *page, pgprot_t prot)
@@ -57,6 +59,8 @@ int install_page(struct mm_struct *mm, s
pte_t *pte;
pte_t pte_val;
spinlock_t *ptl;
+ struct address_space *mapping;
+ struct page *head_page = compound_head(page);

pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
@@ -67,12 +71,13 @@ int install_page(struct mm_struct *mm, s
* caller about it.
*/
err = -EINVAL;
- inode = vma->vm_file->f_mapping->host;
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (!page->mapping || page->index >= size)
+ mapping = vma->vm_file->f_mapping;
+ inode = mapping->host;
+ size = (i_size_read(inode) + page_cache_size(mapping) - 1) >> page_cache_shift(mapping);
+ if (!head_page->mapping || head_page->index >= size)
goto unlock;
err = -ENOMEM;
- if (page_mapcount(page) > INT_MAX/2)
+ if (page_mapcount(head_page) > INT_MAX/2)
goto unlock;

if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
@@ -81,7 +86,7 @@ int install_page(struct mm_struct *mm, s
flush_icache_page(vma, page);
pte_val = mk_pte(page, prot);
set_pte_at(mm, addr, pte, pte_val);
- page_add_file_rmap(page);
+ page_add_file_rmap(head_page);
update_mmu_cache(vma, addr, pte_val);
lazy_mmu_prot_update(pte_val);
err = 0;
