[RESEND][PATCH 4/6] Add page becoming writable notification

From: David Howells
Date: Thu Oct 14 2004 - 14:17:33 EST


Because Christoph asked so nicely, I'm resending this patch with a wider
distribution. The patch currently resides in the -mm kernels. It is being used
by the in-kernel AFS filesystem and by the NFS caching patches that haven't
yet been passed to akpm.

---

The attached patch creates a facility by which a filesystem, character device
or block device can detect that a page mapped to one of its inodes is about to
become writable. The facility is provided at two levels: a VMA operation and
an address space operation. A netfs can use this, for example, to synchronise
with a cache that is writing the page to disc.
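
For illustration only (this is not part of the patch): a filesystem that wants
the notification just fills in the new address_space operation. The myfs_*
names below are invented, and the body is only a guess at the sort of thing a
netfs might do:

	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/pagemap.h>

	/* illustrative only: wait for any write-out of this page to finish
	 * before letting userspace modify it through a shared mapping;
	 * returning a negative error would give the faulter a SIGBUS */
	static int myfs_page_mkwrite(struct page *page)
	{
		wait_on_page_writeback(page);
		return 0;
	}

	static struct address_space_operations myfs_aops = {
		.readpage	= simple_readpage,	/* placeholder */
		.page_mkwrite	= myfs_page_mkwrite,
	};

With page_mkwrite set, generic_file_mmap() then selects
generic_file_vm_mkwr_ops automatically, so the VMA-level hook is wired up
without the filesystem supplying its own vm_operations_struct.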

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
---

 include/linux/fs.h |    3 +
 include/linux/mm.h |    4 ++
 mm/filemap.c       |   19 +++++++++-
 mm/memory.c        |   96 ++++++++++++++++++++++++++++++++++++++++-------------
 4 files changed, 99 insertions(+), 23 deletions(-)

diff -uNrp linux-2.6.9-rc1-mm2/include/linux/fs.h linux-2.6.9-rc1-mm2-cachefs/include/linux/fs.h
--- linux-2.6.9-rc1-mm2/include/linux/fs.h 2004-08-31 16:52:38.000000000 +0100
+++ linux-2.6.9-rc1-mm2-cachefs/include/linux/fs.h 2004-09-03 16:42:33.229296834 +0100
@@ -328,6 +328,9 @@ struct address_space_operations {
 	int (*releasepage) (struct page *, int);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
+
+	/* notification that a page is about to become writable */
+	int (*page_mkwrite)(struct page *page);
 };

 struct backing_dev_info;
diff -uNrp linux-2.6.9-rc1-mm2/include/linux/mm.h linux-2.6.9-rc1-mm2-cachefs/include/linux/mm.h
--- linux-2.6.9-rc1-mm2/include/linux/mm.h 2004-08-31 16:52:38.000000000 +0100
+++ linux-2.6.9-rc1-mm2-cachefs/include/linux/mm.h 2004-09-03 16:47:46.047671396 +0100
@@ -175,6 +175,10 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
 	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+
+	/* notification that a previously read-only page is about to become
+	 * writable, if an error is returned it will cause a SIGBUS */
+	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
 #ifdef CONFIG_NUMA
 	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
diff -uNrp linux-2.6.9-rc1-mm2/mm/filemap.c linux-2.6.9-rc1-mm2-cachefs/mm/filemap.c
--- linux-2.6.9-rc1-mm2/mm/filemap.c 2004-08-31 16:52:40.000000000 +0100
+++ linux-2.6.9-rc1-mm2-cachefs/mm/filemap.c 2004-09-03 16:46:58.780545691 +0100
@@ -1445,11 +1445,25 @@ repeat:
 	return 0;
 }

+/*
+ * pass notification that a page is becoming writable up to the filesystem
+ */
+static int filemap_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	return page->mapping->a_ops->page_mkwrite(page);
+}
+
 struct vm_operations_struct generic_file_vm_ops = {
 	.nopage		= filemap_nopage,
 	.populate	= filemap_populate,
 };

+struct vm_operations_struct generic_file_vm_mkwr_ops = {
+	.nopage		= filemap_nopage,
+	.populate	= filemap_populate,
+	.page_mkwrite	= filemap_page_mkwrite,
+};
+
 /* This is used for a general mmap of a disk file */

 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -1459,7 +1473,10 @@ int generic_file_mmap(struct file
 	if (!mapping->a_ops->readpage)
 		return -ENOEXEC;
 	file_accessed(file);
-	vma->vm_ops = &generic_file_vm_ops;
+	if (!mapping->a_ops->page_mkwrite)
+		vma->vm_ops = &generic_file_vm_ops;
+	else
+		vma->vm_ops = &generic_file_vm_mkwr_ops;
 	return 0;
 }

diff -uNrp linux-2.6.9-rc1-mm2/mm/memory.c linux-2.6.9-rc1-mm2-cachefs/mm/memory.c
--- linux-2.6.9-rc1-mm2/mm/memory.c 2004-08-31 16:52:40.000000000 +0100
+++ linux-2.6.9-rc1-mm2-cachefs/mm/memory.c 2004-09-02 15:40:26.000000000 +0100
@@ -1030,6 +1030,54 @@ static inline void break_cow(struct vm_a
 }

 /*
+ * Make a PTE writeable for do_wp_page() on a shared-writable page
+ */
+static inline int do_wp_page_mk_pte_writable(struct mm_struct *mm,
+					     struct vm_area_struct *vma,
+					     unsigned long address,
+					     pte_t *page_table,
+					     struct page *old_page,
+					     pte_t pte)
+{
+	pte_t entry;
+
+	/* See if the VMA's owner wants to know that the page is about to
+	 * become writable */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+		/* Notify the page owner without the lock held so they can
+		 * sleep if they want to */
+		spin_unlock(&mm->page_table_lock);
+
+		if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+			goto bus_error;
+
+		spin_lock(&mm->page_table_lock);
+
+		/* Since we dropped the lock we need to revalidate the PTE as
+		 * someone else may have changed it. If they did, we just
+		 * return, as we can count on the MMU to tell us if they didn't
+		 * also make it writable
+		 */
+		if (!pte_same(*page_table, pte))
+			goto minor_fault;
+	}
+
+	flush_cache_page(vma, address);
+	entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
+			      vma);
+	ptep_set_access_flags(vma, address, page_table, entry, 1);
+	update_mmu_cache(vma, address, entry);
+	pte_unmap(page_table);
+
+ minor_fault:
+	spin_unlock(&mm->page_table_lock);
+	return VM_FAULT_MINOR;
+
+ bus_error:
+	return VM_FAULT_SIGBUS;
+}
+
+/*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
  * and decrementing the shared-page counter for the old page.
@@ -1054,7 +1102,6 @@ static int do_wp_page(struct mm_struct *
 {
 	struct page *old_page, *new_page;
 	unsigned long pfn = pte_pfn(pte);
-	pte_t entry;

 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1073,16 +1120,11 @@ static int do_wp_page(struct mm_struct *
 	if (!TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
-		if (reuse) {
-			flush_cache_page(vma, address);
-			entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
-					      vma);
-			ptep_set_access_flags(vma, address, page_table, entry, 1);
-			update_mmu_cache(vma, address, entry);
-			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
-			return VM_FAULT_MINOR;
-		}
+		if (reuse)
+			/* We can just make the PTE writable */
+			return do_wp_page_mk_pte_writable(mm, vma, address,
+							  page_table, old_page,
+							  pte);
 	}
 	pte_unmap(page_table);

@@ -1521,18 +1563,28 @@ retry:
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	if (write_access && !(vma->vm_flags & VM_SHARED)) {
-		struct page *page;
+	if (write_access) {
+		if (!(vma->vm_flags & VM_SHARED)) {
+			struct page *page;

-		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-		if (!page)
-			goto oom;
-		copy_user_highpage(page, new_page, address);
-		page_cache_release(new_page);
-		new_page = page;
-		anon = 1;
+			if (unlikely(anon_vma_prepare(vma)))
+				goto oom;
+			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+			if (!page)
+				goto oom;
+			copy_user_highpage(page, new_page, address);
+			page_cache_release(new_page);
+			new_page = page;
+			anon = 1;
+
+		} else {
+			/* if the page will be shareable, see if the backing
+			 * address space wants to know that the page is about
+			 * to become writable */
+			if (vma->vm_ops->page_mkwrite &&
+			    vma->vm_ops->page_mkwrite(vma, new_page) < 0)
+				return VM_FAULT_SIGBUS;
+		}
 	}

 	spin_lock(&mm->page_table_lock);