Re: [PATCH 3/3] AFS: Implement basic file write support

From: Andrew Morton
Date: Tue May 08 2007 - 20:01:39 EST


On Tue, 08 May 2007 20:44:11 +0100
David Howells <dhowells@xxxxxxxxxx> wrote:

> Implement support for writing to regular AFS files, including:
>
> (1) write
>
> (2) truncate
>
> (3) fsync, fdatasync
>
> (4) chmod, chown, chgrp, utime.
>
> AFS writeback attempts to batch writes into chunks as large as it can
> manage, up to the point that it writes back 65535 pages in one chunk or
> it meets a locked page.
>
> Furthermore, if a page has been written to using a particular key, then should
> another write to that page use some other key, the first write will be flushed
> before the second is allowed to take place. If the first write fails due to a
> security error, then the page will be scrapped and reread before the second
> write takes place.
>
> If a page is dirty and the callback on it is broken by the server, then the
> dirty data is not discarded (same behaviour as NFS).
>
> Shared-writable mappings are not supported by this patch.


The below isn't a review - it's some random cherrypicking.

> ...
>
> +int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
> + pgoff_t first, pgoff_t last,
> + unsigned offset, unsigned to,
> + const struct afs_wait_mode *wait_mode)
> +{
> + struct afs_vnode *vnode = wb->vnode;
> + struct afs_call *call;
> + loff_t size, pos, i_size;
> + __be32 *bp;
> +
> + _enter(",%x,{%x:%u},,",
> + key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
> +
> + size = to - offset;
> + if (first != last)
> + size += (loff_t)(last - first) << PAGE_SHIFT;
> + pos = (loff_t)first << PAGE_SHIFT;
> + pos += offset;
> +
> + i_size = i_size_read(&vnode->vfs_inode);
> + if (pos + size > i_size)
> + i_size = size + pos;
> +
> + _debug("size %llx, at %llx, i_size %llx",
> + (unsigned long long) size, (unsigned long long) pos,
> + (unsigned long long) i_size);
> +
> + BUG_ON(i_size > 0xffffffff); // TODO: use 64-bit store

You're sure this isn't user-triggerable?
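
If it is, returning an error would be kinder than taking the box down.
An untested sketch, pending the 64-bit store operation:

	/* untested: refuse writes which the 32-bit store op cannot
	 * express, rather than BUG */
	if (i_size > 0xffffffffULL) {
		_leave(" = -EFBIG");
		return -EFBIG;
	}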

> +static int afs_prepare_page(struct afs_vnode *vnode, struct page *page,
> + struct key *key, unsigned offset, unsigned to)
> +{
> + unsigned eof, tail, start, stop, len;
> + loff_t i_size, pos;
> + void *p;
> + int ret;
> +
> + _enter("");
> +
> + if (offset == 0 && to == PAGE_SIZE)
> + return 0;
> +
> + p = kmap(page);
> +
> + i_size = i_size_read(&vnode->vfs_inode);
> + pos = (loff_t) page->index << PAGE_SHIFT;
> + if (pos >= i_size) {
> + /* partial write, page beyond EOF */
> + _debug("beyond");
> + if (offset > 0)
> + memset(p, 0, offset);
> + if (to < PAGE_SIZE)
> + memset(p + to, 0, PAGE_SIZE - to);
> + kunmap(page);
> + return 0;
> + }
> +
> + if (i_size - pos >= PAGE_SIZE) {
> + /* partial write, page entirely before EOF */
> + _debug("before");
> + tail = eof = PAGE_SIZE;
> + } else {
> + /* partial write, page overlaps EOF */
> + eof = i_size - pos;
> + _debug("overlap %u", eof);
> + tail = max(eof, to);
> + if (tail < PAGE_SIZE)
> + memset(p + tail, 0, PAGE_SIZE - tail);
> + if (offset > eof)
> + memset(p + eof, 0, PAGE_SIZE - eof);
> + }
> +
> + kunmap(page);

kmap_atomic() could be used here and is better.

We have this zero_user_page() thing heading in which could perhaps be used
here also.
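
ie. the beyond-EOF branch could become something like this (untested,
and zero_user_page() is only in -mm as yet):

	/* untested: zero the unwritten parts without an explicit kmap */
	if (offset > 0)
		zero_user_page(page, 0, offset, KM_USER0);
	if (to < PAGE_SIZE)
		zero_user_page(page, to, PAGE_SIZE - to, KM_USER0);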


> + ret = 0;
> + if (offset > 0 || eof > to) {
> + /* need to fill one or two bits that aren't going to be written
> + * (cover both fillers in one read if there are two) */
> + start = (offset > 0) ? 0 : to;
> + stop = (eof > to) ? eof : offset;
> + len = stop - start;
> + _debug("wr=%u-%u av=0-%u rd=%u@%u",
> + offset, to, eof, start, len);
> + ret = afs_fill_page(vnode, key, start, len, page);
> + }
> +
> + _leave(" = %d", ret);
> + return ret;
> +}
> +
>
> ...

> + ASSERTRANGE(wb->first, <=, index, <=, wb->last);

wow.

> +}
> +
> +/*
> + * finalise part of a write to a page
> + */
> +int afs_commit_write(struct file *file, struct page *page,
> + unsigned offset, unsigned to)
> +{
> + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
> + loff_t i_size, maybe_i_size;
> +
> + _enter("{%x:%u},{%lx},%u,%u",
> + vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
> +
> + maybe_i_size = (loff_t) page->index << PAGE_SHIFT;
> + maybe_i_size += to;
> +
> + i_size = i_size_read(&vnode->vfs_inode);
> + if (maybe_i_size > i_size) {
> + spin_lock(&vnode->writeback_lock);
> + i_size = i_size_read(&vnode->vfs_inode);
> + if (maybe_i_size > i_size)
> + i_size_write(&vnode->vfs_inode, maybe_i_size);
> + spin_unlock(&vnode->writeback_lock);
> + }
> +
> + set_page_dirty(page);
> +
> + if (PageDirty(page))
> + _debug("dirtied");
> +
> + return 0;
> +}

One would normally run mark_inode_dirty() after any i_size_write()?
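
ie. something like this (an untested sketch, keeping your locking):

	if (maybe_i_size > i_size) {
		spin_lock(&vnode->writeback_lock);
		i_size = i_size_read(&vnode->vfs_inode);
		if (maybe_i_size > i_size)
			i_size_write(&vnode->vfs_inode, maybe_i_size);
		spin_unlock(&vnode->writeback_lock);
		/* let the VFS know the inode metadata changed */
		mark_inode_dirty(&vnode->vfs_inode);
	}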

> +/*
> + * kill all the pages in the given range
> + */

We can invalidate pages and we can truncate them and we can clean them.
But here we have a new operation, "killing". I wonder what that is.

> +static void afs_kill_pages(struct afs_vnode *vnode, bool error,
> + pgoff_t first, pgoff_t last)
> +{
> + struct pagevec pv;
> + unsigned count, loop;
> +
> + _enter("{%x:%u},%lx-%lx",
> + vnode->fid.vid, vnode->fid.vnode, first, last);
> +
> + pagevec_init(&pv, 0);
> +
> + do {
> + _debug("kill %lx-%lx", first, last);
> +
> + count = last - first + 1;
> + if (count > PAGEVEC_SIZE)
> + count = PAGEVEC_SIZE;
> + pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
> + first, count, pv.pages);
> + ASSERTCMP(pv.nr, ==, count);
> +
> + for (loop = 0; loop < count; loop++) {
> + ClearPageUptodate(pv.pages[loop]);
> + if (error)
> + SetPageError(pv.pages[loop]);
> + end_page_writeback(pv.pages[loop]);
> + }
> +
> + __pagevec_release(&pv);
> + first += count;
> + } while (first < last);
> +
> + _leave("");
> +}
> +
> +/*
> + * write a page back to the server
> + * - the caller locked the page for us
> + */
> +int afs_writepage(struct page *page, struct writeback_control *wbc)
> +{
> + struct backing_dev_info *bdi = page->mapping->backing_dev_info;
> + struct afs_writeback *wb;
> + int ret;
> +
> + _enter("{%lx},", page->index);
> +
> + if (wbc->sync_mode != WB_SYNC_NONE)
> + wait_on_page_writeback(page);

Didn't the VFS already do that?

> + if (PageWriteback(page) || !PageDirty(page)) {
> + unlock_page(page);
> + return 0;
> + }

And some of that?

> + wb = (struct afs_writeback *) page_private(page);
> + ASSERT(wb != NULL);
> +
> + ret = afs_write_back_from_locked_page(wb, page);
> + unlock_page(page);
> + if (ret < 0) {
> + _leave(" = %d", ret);
> + return 0;
> + }
> +
> + wbc->nr_to_write -= ret;
> + if (wbc->nonblocking && bdi_write_congested(bdi))
> + wbc->encountered_congestion = 1;
> +
> + _leave(" = 0");
> + return 0;
> +}

I have this vague prehistoric memory that something can go wrong at the VFS
level if the address_space writes back more pages than it was asked to.
But I forget what the problem was, and it would be silly for the caller to
have an issue with that anyway. Something to keep an eye out for.
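
If it does turn out to matter, the chunk could be clamped to the request -
an untested sketch, and it assumes afs_write_back_from_locked_page() gets
handed the wbc:

	/* hypothetical: don't let one ->writepage write back more
	 * pages than the VM asked for */
	pgoff_t final = wb->last;
	if (wbc->nr_to_write > 0 &&
	    (pgoff_t) wbc->nr_to_write <= final - page->index)
		final = page->index + wbc->nr_to_write - 1;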

