Re: bcache super block corruption with non 4k pages
From: Stefan Bader
Date: Tue Jul 26 2016 - 05:51:51 EST
On 21.07.2016 10:58, Stefan Bader wrote:
> I was pointed at the thread which seems to address the same after
> I wrote most of below text. Did not want to re-write this so please
> bear with the odd layout.
>
> https://www.redhat.com/archives/dm-devel/2016-June/msg00015.html
>
> Zhengyuan tries to fix the problem by relocating the superblock on
> disk. But I am not sure whether there is really any guarantee about
> how __bread fills data into the buffer_head. What if there is the next
> odd arch with 128K pages?
>
> So below is an attempt to be more generic. Still I don't feel completely
> happy with the way that a page moves (or is shared) between buffer_head
> and biovec. What I tried to outline below is to let the register functions
> allocate bio+biovec memory and use the in-memory sb_cache data to initialize
> the biovec buffer.
Any opinions here? Also adding LKML as I don't seem to get through moderation on
dm-devel.
>
> -Stefan
>
>
> ---
>
> This was seen on ppc64le with 64k page size. The problem is that
> in that case __bread returns a page where b_data is not at the
> start of the page. And I am not really sure that the way bcache
> re-purposes the page returned by __bread to be used as biovec
> element is really a good idea.
>
> The way it is now, I saw __bread return a 64k page where b_data
> was starting at 4k offset. Then __write_super was modifying some
> data at offset 0 but for example not writing the magic number again.
>
> Not sure why (maybe this messes up flushes, too) but the bad data was not
> immediately written back when the devices were registered. So at that time
> bcache-super-show would report data as it was written by user-space (like
> the cache type still 0 and not 3, and claiming the cache to still bei
> detached).
>
> But when stopping the cache and unregistering the cache set this changed
> and bcache-super-show was reporting a bad magic number (as expected).
>
> The patch below works around this (tested with 64k and 4k pages) but I
> am not really sure this is a clean approach. My gut feeling would be that
> the bio structures should be allocated speperately (I think there is a way
> of allocating a bioset without using a pool). Then that separate page could
> be pre-initialized from the super block structure without passing around
> a page the feels more private to __bread usage...
>
> -Stefan
>
>
>
> From 3652e98359b876f3c1e6d7b9b7305e3411178296 Mon Sep 17 00:00:00 2001
> From: Stefan Bader <stefan.bader@xxxxxxxxxxxxx>
> Date: Wed, 20 Jul 2016 12:06:27 +0200
> Subject: [PATCH] bcache: handle non 4k pages returned by __bread
>
> On non-x86 arches pages can be bigger than 4k. Currently read_super
> returns a reference to the page used as buffer in the buffer_head
> that is returned by __bread().
> With 4k page size and a requested read this normally ends up with
> the super block data starting at offset 0. But as seen on ppc64le
> with 64k page size, the data can start at an offset (from what I
> saw the offset was 4k).
> This causes harm later on when __write_super() maps the super
> block data at the start of the page acquired before and also
> not writes back all fields (particularly the super block magic).
>
> Try to avoid this by also returning the potential offset within the
> sb_page from read_super() and fully initiallize the single bvec of
> the bio used for __write_super() calls. Doing that is the same as
> would have been done in bch_bio_map() which now must not be used in
> __write_super().
>
> Signed-off-by: Stefan Bader <stefan.bader@xxxxxxxxxxxxx>
>
> removedebug
>
> Signed-off-by: Stefan Bader <stefan.bader@xxxxxxxxxxxxx>
> ---
> drivers/md/bcache/super.c | 22 +++++++++++++++-------
> 1 file changed, 15 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index e169739..3e0d2de 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -62,7 +62,7 @@ struct workqueue_struct *bcache_wq;
> /* Superblock */
>
> static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
> - struct page **res)
> + struct page **res, unsigned int *off)
> {
> const char *err;
> struct cache_sb *s;
> @@ -192,6 +192,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
> err = NULL;
>
> get_page(bh->b_page);
> + *off = (unsigned int) (bh->b_data - ((char *) page_address(bh->b_page)));
> *res = bh->b_page;
> err:
> put_bh(bh);
> @@ -202,19 +203,19 @@ static void write_bdev_super_endio(struct bio *bio)
> {
> struct cached_dev *dc = bio->bi_private;
> /* XXX: error checking */
> -
> closure_put(&dc->sb_write);
> }
>
> static void __write_super(struct cache_sb *sb, struct bio *bio)
> {
> - struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
> + struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page) +
> + bio->bi_io_vec[0].bv_offset;
> unsigned i;
>
> bio->bi_iter.bi_sector = SB_SECTOR;
> bio->bi_rw = REQ_SYNC|REQ_META;
> bio->bi_iter.bi_size = SB_SIZE;
> - bch_bio_map(bio, NULL);
> + // bch_bio_map(bio, NULL);
>
> out->offset = cpu_to_le64(sb->offset);
> out->version = cpu_to_le64(sb->version);
> @@ -1139,6 +1140,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
> /* Cached device - bcache superblock */
>
> static void register_bdev(struct cache_sb *sb, struct page *sb_page,
> + unsigned int sb_off,
> struct block_device *bdev,
> struct cached_dev *dc)
> {
> @@ -1154,6 +1156,8 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
> dc->sb_bio.bi_max_vecs = 1;
> dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
> dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
> + dc->sb_bio.bi_io_vec[0].bv_len = SB_SIZE;
> + dc->sb_bio.bi_io_vec[0].bv_offset = sb_off;
> get_page(sb_page);
>
> if (cached_dev_init(dc, sb->block_size << 9))
> @@ -1839,6 +1843,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
> }
>
> static int register_cache(struct cache_sb *sb, struct page *sb_page,
> + unsigned int sb_off,
> struct block_device *bdev, struct cache *ca)
> {
> char name[BDEVNAME_SIZE];
> @@ -1853,6 +1858,8 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
> ca->sb_bio.bi_max_vecs = 1;
> ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
> ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
> + ca->sb_bio.bi_io_vec[0].bv_len = SB_SIZE;
> + ca->sb_bio.bi_io_vec[0].bv_offset = sb_off;
> get_page(sb_page);
>
> if (blk_queue_discard(bdev_get_queue(ca->bdev)))
> @@ -1936,6 +1943,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> struct cache_sb *sb = NULL;
> struct block_device *bdev = NULL;
> struct page *sb_page = NULL;
> + unsigned int sb_off;
>
> if (!try_module_get(THIS_MODULE))
> return -EBUSY;
> @@ -1967,7 +1975,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> if (set_blocksize(bdev, 4096))
> goto err_close;
>
> - err = read_super(sb, bdev, &sb_page);
> + err = read_super(sb, bdev, &sb_page, &sb_off);
> if (err)
> goto err_close;
>
> @@ -1977,14 +1985,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
> goto err_close;
>
> mutex_lock(&bch_register_lock);
> - register_bdev(sb, sb_page, bdev, dc);
> + register_bdev(sb, sb_page, sb_off, bdev, dc);
> mutex_unlock(&bch_register_lock);
> } else {
> struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
> if (!ca)
> goto err_close;
>
> - if (register_cache(sb, sb_page, bdev, ca) != 0)
> + if (register_cache(sb, sb_page, sb_off, bdev, ca) != 0)
> goto err_close;
> }
> out:
>
Attachment:
signature.asc
Description: OpenPGP digital signature