[ 100/101] pnfsblock: fix non-aligned DIO read

From: Greg Kroah-Hartman
Date: Mon Oct 29 2012 - 17:38:10 EST


3.6-stable review patch. If anyone has any objections, please let me know.

------------------

From: Peng Tao <bergwolf@xxxxxxxxx>

commit f742dc4a32587bff50b13dde9d8894b96851951a upstream.

For DIO read, if it is not sector aligned, we should reject it
and resend via MDS. Otherwise there might be data corruption.
Also teach bl_read_pagelist to handle partial page reads for DIO.

Signed-off-by: Peng Tao <tao.peng@xxxxxxx>
Signed-off-by: Trond Myklebust <Trond.Myklebust@xxxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>

---
fs/nfs/blocklayout/blocklayout.c | 77 +++++++++++++++++++++++----------------
1 file changed, 46 insertions(+), 31 deletions(-)

--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -242,14 +242,6 @@ bl_end_par_io_read(void *data, int unuse
schedule_work(&rdata->task.u.tk_work);
}

-static bool
-bl_check_alignment(u64 offset, u32 len, unsigned long blkmask)
-{
- if ((offset & blkmask) || (len & blkmask))
- return false;
- return true;
-}
-
static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
@@ -260,15 +252,15 @@ bl_read_pagelist(struct nfs_read_data *r
sector_t isect, extent_length = 0;
struct parallel_io *par;
loff_t f_offset = rdata->args.offset;
+ size_t bytes_left = rdata->args.count;
+ unsigned int pg_offset, pg_len;
struct page **pages = rdata->args.pages;
int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+ const bool is_dio = (header->dreq != NULL);

dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);

- if (!bl_check_alignment(f_offset, rdata->args.count, PAGE_CACHE_MASK))
- goto use_mds;
-
par = alloc_parallel(rdata);
if (!par)
goto use_mds;
@@ -298,36 +290,53 @@ bl_read_pagelist(struct nfs_read_data *r
extent_length = min(extent_length, cow_length);
}
}
+
+ if (is_dio) {
+ pg_offset = f_offset & ~PAGE_CACHE_MASK;
+ if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
+ pg_len = PAGE_CACHE_SIZE - pg_offset;
+ else
+ pg_len = bytes_left;
+
+ f_offset += pg_len;
+ bytes_left -= pg_len;
+ isect += (pg_offset >> SECTOR_SHIFT);
+ } else {
+ pg_offset = 0;
+ pg_len = PAGE_CACHE_SIZE;
+ }
+
hole = is_hole(be, isect);
if (hole && !cow_read) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
dprintk("%s Zeroing page for hole\n", __func__);
- zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+ zero_user_segment(pages[i], pg_offset, pg_len);
print_page(pages[i]);
SetPageUptodate(pages[i]);
} else {
struct pnfs_block_extent *be_read;

be_read = (hole && cow_read) ? cow_read : be;
- bio = bl_add_page_to_bio(bio, rdata->pages.npages - i,
+ bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
READ,
isect, pages[i], be_read,
- bl_end_io_read, par);
+ bl_end_io_read, par,
+ pg_offset, pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
}
- isect += PAGE_CACHE_SECTORS;
+ isect += (pg_len >> SECTOR_SHIFT);
extent_length -= PAGE_CACHE_SECTORS;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
rdata->res.eof = 1;
- rdata->res.count = header->inode->i_size - f_offset;
+ rdata->res.count = header->inode->i_size - rdata->args.offset;
} else {
- rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
+ rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
}
out:
bl_put_extent(be);
@@ -676,7 +685,7 @@ bl_write_pagelist(struct nfs_write_data
struct bio *bio = NULL;
struct pnfs_block_extent *be = NULL, *cow_read = NULL;
sector_t isect, last_isect = 0, extent_length = 0;
- struct parallel_io *par = NULL;
+ struct parallel_io *par;
loff_t offset = wdata->args.offset;
size_t count = wdata->args.count;
unsigned int pg_offset, pg_len, saved_len;
@@ -688,10 +697,6 @@ bl_write_pagelist(struct nfs_write_data
NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;

dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
- /* Check for alignment first */
- if (!bl_check_alignment(offset, count, PAGE_CACHE_MASK))
- goto out_mds;
-
/* At this point, wdata->pages is a (sequential) list of nfs_pages.
* We want to write each, and if there is an error set pnfs_error
* to have it redone using nfs.
@@ -1164,32 +1169,42 @@ bl_clear_layoutdriver(struct nfs_server
return 0;
}

+static bool
+is_aligned_req(struct nfs_page *req, unsigned int alignment)
+{
+ return IS_ALIGNED(req->wb_offset, alignment) &&
+ IS_ALIGNED(req->wb_bytes, alignment);
+}
+
static void
bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK))
+ if (pgio->pg_dreq != NULL &&
+ !is_aligned_req(req, SECTOR_SIZE))
nfs_pageio_reset_read_mds(pgio);
else
pnfs_generic_pg_init_read(pgio, req);
}

-static void
-bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+static bool
+bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
{
- if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK))
- nfs_pageio_reset_write_mds(pgio);
- else
- pnfs_generic_pg_init_write(pgio, req);
+ if (pgio->pg_dreq != NULL &&
+ !is_aligned_req(req, SECTOR_SIZE))
+ return false;
+
+ return pnfs_generic_pg_test(pgio, prev, req);
}

static const struct nfs_pageio_ops bl_pg_read_ops = {
.pg_init = bl_pg_init_read,
- .pg_test = pnfs_generic_pg_test,
+ .pg_test = bl_pg_test_read,
.pg_doio = pnfs_generic_pg_readpages,
};

static const struct nfs_pageio_ops bl_pg_write_ops = {
- .pg_init = bl_pg_init_write,
+ .pg_init = pnfs_generic_pg_init_write,
.pg_test = pnfs_generic_pg_test,
.pg_doio = pnfs_generic_pg_writepages,
};


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/