[PATCH] loop: fastfs support

From: Jens Axboe
Date: Tue Jan 15 2008 - 04:06:34 EST


Add code to support redirecting IO directly to the backing file's blocks
on the underlying device instead of going through the page cache.

Signed-off-by: Jens Axboe <jens.axboe@xxxxxxxxxx>
---
drivers/block/loop.c | 466 +++++++++++++++++++++++++++++++++++++++++++++++++-
include/linux/loop.h | 13 ++
2 files changed, 472 insertions(+), 7 deletions(-)
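
For quick testing, here's a minimal userspace sketch (not part of the patch)
of how fastfs mode could be enabled through the new ioctl. It assumes the
loop device has already been bound to a backing file (e.g. via losetup /
LOOP_SET_FD); note that, as posted, loop_set_fd() still calls
loop_init_fastfs() automatically, see the comment there.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#ifndef LOOP_SET_FASTFS
#define LOOP_SET_FASTFS	0x4C07	/* matches the definition added to linux/loop.h */
#endif

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "/dev/loop0";
	int fd = open(dev, O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* switch the already-bound loop device to extent-mapped (fastfs) mode */
	if (ioctl(fd, LOOP_SET_FASTFS, 0) < 0) {
		perror("LOOP_SET_FASTFS");
		close(fd);
		return 1;
	}

	printf("%s: fastfs enabled\n", dev);
	close(fd);
	return 0;
}

The ioctl fails with EINVAL if the backing file is not a regular file or its
filesystem doesn't provide a ->map_extent address_space operation.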

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b8af22e..ba46149 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -76,6 +76,7 @@
#include <linux/gfp.h>
#include <linux/kthread.h>
#include <linux/splice.h>
+#include <linux/extent_map.h>

#include <asm/uaccess.h>

@@ -481,16 +482,55 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
return ret;
}

+#define __lo_throttle(wq, lock, condition) \
+do { \
+ DEFINE_WAIT(__wait); \
+ for (;;) { \
+ prepare_to_wait((wq), &__wait, TASK_UNINTERRUPTIBLE); \
+ if (condition) \
+ break; \
+ spin_unlock_irq((lock)); \
+ io_schedule(); \
+ spin_lock_irq((lock)); \
+ } \
+ finish_wait((wq), &__wait); \
+} while (0)
+
+static inline int lo_act_bio(struct bio *bio)
+{
+ return bio->bi_bdev != NULL;
+}
+
+#define LO_BIO_THROTTLE 128
+
+/*
+ * A normal block device will throttle on request allocation. Do the same
+ * for loop to prevent millions of bios from being queued internally.
+ */
+static void loop_bio_throttle(struct loop_device *lo, struct bio *bio)
+{
+ if (lo_act_bio(bio))
+ __lo_throttle(&lo->lo_bio_wait, &lo->lo_lock,
+ lo->lo_bio_cnt < LO_BIO_THROTTLE);
+}
+
/*
- * Add bio to back of pending list
+ * Add bio to back of pending list and wakeup thread
*/
static void loop_add_bio(struct loop_device *lo, struct bio *bio)
{
+ loop_bio_throttle(lo, bio);
+
if (lo->lo_biotail) {
lo->lo_biotail->bi_next = bio;
lo->lo_biotail = bio;
} else
lo->lo_bio = lo->lo_biotail = bio;
+
+ if (lo_act_bio(bio))
+ lo->lo_bio_cnt++;
+
+ wake_up(&lo->lo_event);
}

/*
@@ -510,6 +550,179 @@ static struct bio *loop_get_bio(struct loop_device *lo)
return bio;
}

+static void loop_exit_fastfs(struct loop_device *lo)
+{
+ /*
+ * drop whatever page cache we instantiated while filling holes
+ */
+ invalidate_inode_pages2(lo->lo_backing_file->f_mapping);
+
+ blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_NONE, NULL);
+}
+
+static inline u64 lo_bio_offset(struct loop_device *lo, struct bio *bio)
+{
+ return (u64)lo->lo_offset + ((u64)bio->bi_sector << 9);
+}
+
+/*
+ * Find the extent mapping this loop device block to the file's block on
+ * the real device
+ */
+static struct extent_map *loop_lookup_extent(struct loop_device *lo,
+ u64 offset, gfp_t gfp_mask)
+{
+ struct address_space *mapping;
+ struct extent_map *em;
+ u64 len = 1 << lo->blkbits;
+
+ mapping = lo->lo_backing_file->f_mapping;
+ em = mapping->a_ops->map_extent(mapping, NULL, 0,
+ offset, len, 0, gfp_mask);
+ return em;
+}
+
+/*
+ * Alloc a hint bio to tell the loop thread to read file blocks for a given
+ * range
+ */
+static void loop_schedule_extent_mapping(struct loop_device *lo,
+ sector_t sector,
+ unsigned long len, int wait)
+{
+ struct bio *bio, stackbio;
+
+ /*
+ * An occasional allocation failure is OK here. If called with 'wait'
+ * set, use an on-stack bio instead, since that path must not fail.
+ */
+ if (wait) {
+ bio = &stackbio;
+ bio_init(bio);
+ } else
+ bio = bio_alloc(GFP_ATOMIC, 0);
+
+ if (bio) {
+ DECLARE_COMPLETION_ONSTACK(comp);
+
+ bio->bi_rw = LOOP_EXTENT_RW_MAGIC;
+ bio->bi_sector = sector;
+ bio->bi_size = len;
+
+ loop_add_bio(lo, bio);
+
+ if (wait) {
+ /*
+ * Setting bi_private after the add is safe: loop_add_bio() never
+ * drops the lock for this bio (!lo_act_bio(bio))
+ */
+ bio->bi_private = &comp;
+
+ /*
+ * We are never called with wait != 0 from a context where
+ * using spin_unlock_irq(), which unconditionally enables
+ * interrupts, would be unsafe.
+ */
+ spin_unlock_irq(&lo->lo_lock);
+ wait_for_completion(&comp);
+ spin_lock_irq(&lo->lo_lock);
+ }
+ }
+}
+
+static void loop_handle_extent_hole(struct loop_device *lo, struct bio *bio)
+{
+ /*
+ * for a read, just zero the data and end the io
+ */
+ if (bio_data_dir(bio) == READ) {
+ struct bio_vec *bvec;
+ unsigned long flags;
+ int i;
+
+ bio_for_each_segment(bvec, bio, i) {
+ char *dst = bvec_kmap_irq(bvec, &flags);
+
+ memset(dst, 0, bvec->bv_len);
+ bvec_kunmap_irq(dst, &flags);
+ }
+ bio_endio(bio, 0);
+ } else {
+ /*
+ * let the page cache handling path do this bio, and then
+ * look up the mapped blocks after the io has been issued to
+ * instantiate extents.
+ */
+ loop_add_bio(lo, bio);
+ }
+}
+
+static inline int lo_is_switch_bio(struct bio *bio)
+{
+ return !bio->bi_bdev && bio->bi_rw == LOOP_SWITCH_RW_MAGIC;
+}
+
+static inline int lo_is_map_bio(struct bio *bio)
+{
+ return !bio->bi_bdev && bio->bi_rw == LOOP_EXTENT_RW_MAGIC;
+}
+
+/*
+ * Change mapping of the bio, so that it points to the real bdev and offset
+ */
+static int loop_redirect_bio(struct loop_device *lo, struct bio *bio)
+{
+ struct extent_map *lfe;
+ u64 extent_off;
+ u64 disk_block;
+ u64 start = lo_bio_offset(lo, bio);
+
+ lfe = loop_lookup_extent(lo, start, GFP_ATOMIC);
+ if (IS_ERR(lfe))
+ return -EIO;
+
+ while (!lfe) {
+ loop_schedule_extent_mapping(lo, bio->bi_sector,
+ bio->bi_size, 1);
+ lfe = loop_lookup_extent(lo, start, GFP_ATOMIC);
+ if (IS_ERR(lfe))
+ return -EIO;
+ }
+
+ /*
+ * handle sparse io
+ */
+ if (lfe->block_start == EXTENT_MAP_HOLE) {
+ loop_handle_extent_hole(lo, bio);
+ free_extent_map(lfe);
+ return 0;
+ }
+
+ /*
+ * not a hole, redirect
+ */
+ disk_block = lfe->block_start;
+ extent_off = start - lfe->start;
+ bio->bi_bdev = lfe->bdev;
+ bio->bi_sector = (disk_block + extent_off) >> 9;
+ free_extent_map(lfe);
+ return 1;
+}
+
+/*
+ * Wait for the bios on our list to complete before sending a barrier bio
+ * to the device below. Called with lo_lock held.
+ */
+static void loop_wait_on_bios(struct loop_device *lo)
+{
+ __lo_throttle(&lo->lo_bio_wait, &lo->lo_lock, !lo->lo_bio);
+}
+
+static void loop_wait_on_switch(struct loop_device *lo)
+{
+ __lo_throttle(&lo->lo_bio_wait, &lo->lo_lock, !lo->lo_switch);
+}
+
static int loop_make_request(struct request_queue *q, struct bio *old_bio)
{
struct loop_device *lo = q->queuedata;
@@ -525,15 +738,39 @@ static int loop_make_request(struct request_queue *q, struct bio *old_bio)
goto out;
if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
goto out;
+ if (lo->lo_flags & LO_FLAGS_FASTFS) {
+ /*
+ * If we get a barrier bio, then we just need to wait for
+ * existing bios to complete. This can only happen
+ * on the 'new' extent-mapped loop, since that is the only
+ * one that supports barriers.
+ */
+ if (bio_barrier(old_bio))
+ loop_wait_on_bios(lo);
+
+ /*
+ * if file switch is in progress, wait for it to complete
+ */
+ if (!lo_is_switch_bio(old_bio) && lo->lo_switch)
+ loop_wait_on_switch(lo);
+
+ if (loop_redirect_bio(lo, old_bio))
+ goto out_redir;
+ goto out_end;
+ }
loop_add_bio(lo, old_bio);
- wake_up(&lo->lo_event);
spin_unlock_irq(&lo->lo_lock);
return 0;

out:
- spin_unlock_irq(&lo->lo_lock);
bio_io_error(old_bio);
+out_end:
+ spin_unlock_irq(&lo->lo_lock);
return 0;
+
+out_redir:
+ spin_unlock_irq(&lo->lo_lock);
+ return 1;
}

/*
@@ -547,21 +784,113 @@ static void loop_unplug(struct request_queue *q)
blk_run_address_space(lo->lo_backing_file->f_mapping);
}

+static void loop_unplug_fastfs(struct request_queue *q)
+{
+ struct loop_device *lo = q->queuedata;
+ struct request_queue *rq = bdev_get_queue(lo->fs_bdev);
+
+ clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+
+ if (rq->unplug_fn)
+ rq->unplug_fn(rq);
+}
+
struct switch_request {
struct file *file;
struct completion wait;
};

static void do_loop_switch(struct loop_device *, struct switch_request *);
+static int loop_init_fastfs(struct loop_device *);
+
+static void end_bio_hole_filling(struct bio *bio, int err)
+{
+ struct address_space *mapping = bio->bi_bdev->bd_inode->i_mapping;
+ struct bio *orig_bio = bio->bi_private;
+
+ if (mapping->a_ops->extent_io_complete) {
+ u64 start = orig_bio->bi_sector << 9;
+ u64 len = bio->bi_size;
+
+ mapping->a_ops->extent_io_complete(mapping, start, len);
+ }
+
+ bio_put(bio);
+ bio_endio(orig_bio, err);
+}
+
+static int fill_extent_hole(struct loop_device *lo, struct bio *bio)
+{
+ struct address_space *mapping = lo->lo_backing_file->f_mapping;
+ struct bio *new_bio;
+ struct extent_map *em;
+ u64 len = bio->bi_size;
+ u64 start = lo_bio_offset(lo, bio);
+ u64 disk_block;
+ u64 extent_off;
+
+ /*
+ * change the sector so we can find the correct file offset in our
+ * endio
+ */
+ bio->bi_sector = lo_bio_offset(lo, bio) >> 9;
+
+ mutex_lock(&mapping->host->i_mutex);
+
+ em = mapping->a_ops->map_extent(mapping, NULL, 0,
+ start, len, 1, GFP_KERNEL);
+ mark_inode_dirty(mapping->host);
+ mutex_unlock(&mapping->host->i_mutex);
+
+ if (em && !IS_ERR(em)) {
+ disk_block = em->block_start;
+ extent_off = start - em->start;
+
+ /*
+ * bio_clone() is mempool-backed, so if __GFP_WAIT is set
+ * it won't ever fail
+ */
+ new_bio = bio_clone(bio, GFP_NOIO);
+ new_bio->bi_sector = (disk_block + extent_off) >> 9;
+ new_bio->bi_bdev = em->bdev;
+ new_bio->bi_private = bio;
+ new_bio->bi_size = bio->bi_size;
+ new_bio->bi_end_io = end_bio_hole_filling;
+ free_extent_map(em);
+
+ generic_make_request(new_bio);
+ return 0;
+ }
+
+ bio_endio(bio, -EIO);
+ return 0;
+}

static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
{
- if (unlikely(!bio->bi_bdev)) {
+ struct extent_map *lfe;
+
+ if (lo_is_map_bio(bio)) {
+ lfe = loop_lookup_extent(lo, lo_bio_offset(lo, bio),
+ GFP_KERNEL);
+ free_extent_map(lfe);
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ else
+ bio_put(bio);
+ } else if (lo_is_switch_bio(bio)) {
do_loop_switch(lo, bio->bi_private);
bio_put(bio);
} else {
- int ret = do_bio_filebacked(lo, bio);
- bio_endio(bio, ret);
+ int ret;
+
+ if (lo->lo_flags & LO_FLAGS_FASTFS) {
+ /* we only get here when filling holes */
+ ret = fill_extent_hole(lo, bio);
+ } else {
+ ret = do_bio_filebacked(lo, bio);
+ bio_endio(bio, ret);
+ }
}
}

@@ -581,6 +910,7 @@ static int loop_thread(void *data)
{
struct loop_device *lo = data;
struct bio *bio;
+ int bio_act;

set_user_nice(current, -20);

@@ -588,7 +918,6 @@ static int loop_thread(void *data)

wait_event_interruptible(lo->lo_event,
lo->lo_bio || kthread_should_stop());
-
if (!lo->lo_bio)
continue;
spin_lock_irq(&lo->lo_lock);
@@ -596,7 +925,16 @@ static int loop_thread(void *data)
spin_unlock_irq(&lo->lo_lock);

BUG_ON(!bio);
+
+ bio_act = lo_act_bio(bio);
loop_handle_bio(lo, bio);
+
+ spin_lock_irq(&lo->lo_lock);
+ if (bio_act)
+ lo->lo_bio_cnt--;
+ if (lo->lo_bio_cnt < LO_BIO_THROTTLE || !lo->lo_bio)
+ wake_up(&lo->lo_bio_wait);
+ spin_unlock_irq(&lo->lo_lock);
}

return 0;
@@ -617,6 +955,8 @@ static int loop_switch(struct loop_device *lo, struct file *file)
w.file = file;
bio->bi_private = &w;
bio->bi_bdev = NULL;
+ bio->bi_rw = LOOP_SWITCH_RW_MAGIC;
+ lo->lo_switch = 1;
loop_make_request(lo->lo_queue, bio);
wait_for_completion(&w.wait);
return 0;
@@ -630,6 +970,10 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
struct file *file = p->file;
struct file *old_file = lo->lo_backing_file;
struct address_space *mapping = file->f_mapping;
+ const int fastfs = lo->lo_flags & LO_FLAGS_FASTFS;
+
+ if (fastfs)
+ loop_exit_fastfs(lo);

mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
lo->lo_backing_file = file;
@@ -637,6 +981,13 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+ if (fastfs)
+ loop_init_fastfs(lo);
+
+ lo->lo_switch = 0;
+ wake_up(&lo->lo_bio_wait);
+
complete(&p->wait);
}

@@ -700,6 +1051,83 @@ static int loop_change_fd(struct loop_device *lo, struct file *lo_file,
return error;
}

+/*
+ * See if adding this bvec would cause us to spill into a new extent. If so,
+ * disallow the add so that a new bio is started instead. This ensures that
+ * the bios we receive in loop_make_request() never span more than one extent.
+ */
+static int loop_merge_bvec(struct request_queue *q, struct bio *bio,
+ struct bio_vec *bvec)
+{
+ struct loop_device *lo = q->queuedata;
+ struct extent_map *lfe;
+ unsigned int ret;
+ u64 start;
+ u64 len;
+
+ start = lo_bio_offset(lo, bio);
+ len = bio->bi_size + bvec->bv_len;
+ ret = bvec->bv_len;
+
+ lfe = loop_lookup_extent(lo, start, GFP_ATOMIC);
+ if (lfe && !IS_ERR(lfe)) {
+ /*
+ * have extent, disallow if outside that extent
+ */
+ if (start + len > lfe->start + lfe->len)
+ ret = 0;
+
+ free_extent_map(lfe);
+ } else {
+ if (bio->bi_size)
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * Initialize the members pertaining to extent mapping. We will populate
+ * the tree lazily on demand, as a full scan of a big file can take some
+ * time.
+ */
+static int loop_init_fastfs(struct loop_device *lo)
+{
+ struct file *file = lo->lo_backing_file;
+ struct inode *inode = file->f_mapping->host;
+ struct request_queue *fs_q;
+ int ret;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ /*
+ * Need a working extent_map
+ */
+ if (inode->i_mapping->a_ops->map_extent == NULL)
+ return -EINVAL;
+ /*
+ * invalidate all page cache belonging to this file, as it could become
+ * stale once we overwrite blocks directly.
+ */
+ ret = invalidate_inode_pages2(file->f_mapping);
+ if (unlikely(ret))
+ return ret;
+
+ lo->blkbits = inode->i_blkbits;
+ lo->fs_bdev = file->f_mapping->host->i_sb->s_bdev;
+ lo->lo_flags |= LO_FLAGS_FASTFS;
+ lo->lo_queue->unplug_fn = loop_unplug_fastfs;
+
+ blk_queue_merge_bvec(lo->lo_queue, loop_merge_bvec);
+ blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN, NULL);
+
+ fs_q = bdev_get_queue(lo->fs_bdev);
+ blk_queue_stack_limits(lo->lo_queue, fs_q);
+
+ printk(KERN_INFO "loop%d: fast redirect\n", lo->lo_number);
+ return 0;
+}
+
static inline int is_loop_device(struct file *file)
{
struct inode *i = file->f_mapping->host;
@@ -748,6 +1176,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,

mapping = file->f_mapping;
inode = mapping->host;
+ lo->lo_flags = 0;

if (!(file->f_mode & FMODE_WRITE))
lo_flags |= LO_FLAGS_READ_ONLY;
@@ -811,6 +1240,12 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,

set_blocksize(bdev, lo_blocksize);

+ /*
+ * This should be done with a separate ioctl after setup,
+ * not automatically like this.
+ */
+ loop_init_fastfs(lo);
+
lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
lo->lo_number);
if (IS_ERR(lo->lo_thread)) {
@@ -896,6 +1331,9 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)

kthread_stop(lo->lo_thread);

+ if (lo->lo_flags & LO_FLAGS_FASTFS)
+ loop_exit_fastfs(lo);
+
lo->lo_backing_file = NULL;

loop_release_xfer(lo);
@@ -943,6 +1381,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
if (info->lo_encrypt_type) {
unsigned int type = info->lo_encrypt_type;

+ if (lo->lo_flags & LO_FLAGS_FASTFS)
+ return -EINVAL;
+
if (type >= MAX_LO_CRYPT)
return -EINVAL;
xfer = xfer_funcs[type];
@@ -951,6 +1392,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
} else
xfer = NULL;

+ /*
+ * for remaps, the offset must be a multiple of the block size
+ */
+ if ((lo->lo_flags & LO_FLAGS_FASTFS) &&
+ (((1 << lo->blkbits) - 1) & info->lo_offset))
+ return -EINVAL;
+
err = loop_init_xfer(lo, xfer, info);
if (err)
return err;
@@ -1153,6 +1601,9 @@ static int lo_ioctl(struct inode * inode, struct file * file,
case LOOP_GET_STATUS64:
err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
break;
+ case LOOP_SET_FASTFS:
+ err = loop_init_fastfs(lo);
+ break;
default:
err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
}
@@ -1412,6 +1863,7 @@ static struct loop_device *loop_alloc(int i)
lo->lo_number = i;
lo->lo_thread = NULL;
init_waitqueue_head(&lo->lo_event);
+ init_waitqueue_head(&lo->lo_bio_wait);
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i;
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 26a0a10..7b3cb27 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -50,22 +50,28 @@ struct loop_device {

struct file * lo_backing_file;
struct block_device *lo_device;
+ struct block_device *fs_bdev;
unsigned lo_blocksize;
void *key_data;
+ unsigned int lo_switch;

gfp_t old_gfp_mask;

spinlock_t lo_lock;
struct bio *lo_bio;
struct bio *lo_biotail;
+ unsigned int lo_bio_cnt;
int lo_state;
struct mutex lo_ctl_mutex;
struct task_struct *lo_thread;
wait_queue_head_t lo_event;
+ wait_queue_head_t lo_bio_wait;

struct request_queue *lo_queue;
struct gendisk *lo_disk;
struct list_head lo_list;
+
+ unsigned int blkbits;
};

#endif /* __KERNEL__ */
@@ -76,6 +82,7 @@ struct loop_device {
enum {
LO_FLAGS_READ_ONLY = 1,
LO_FLAGS_USE_AOPS = 2,
+ LO_FLAGS_FASTFS = 4,
};

#include <asm/posix_types.h> /* for __kernel_old_dev_t */
@@ -159,5 +166,11 @@ int loop_unregister_transfer(int number);
#define LOOP_SET_STATUS64 0x4C04
#define LOOP_GET_STATUS64 0x4C05
#define LOOP_CHANGE_FD 0x4C06
+#define LOOP_SET_FASTFS 0x4C07
+
+enum {
+ LOOP_EXTENT_RW_MAGIC = 0x19283746,
+ LOOP_SWITCH_RW_MAGIC = 0xfeedbeef,
+};

#endif
--
1.5.4.rc2.84.gf85fd

