[PATCH 3/3] Bcache: version 4

From: Kent Overstreet
Date: Fri Apr 30 2010 - 20:14:07 EST


block/Makefile | 2 +
block/bcache.c | 2624 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 2626 insertions(+), 0 deletions(-)

diff --git a/block/Makefile b/block/Makefile
index cb2d515..e9b5fc0 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -15,3 +15,5 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o

obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
+
+obj-$(CONFIG_BLK_CACHE) += bcache.o
diff --git a/block/bcache.c b/block/bcache.c
new file mode 100644
index 0000000..0f26277
--- /dev/null
+++ b/block/bcache.c
@@ -0,0 +1,2624 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@xxxxxxxxx>
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#define pr_fmt(fmt) "bcache: %s() " fmt, __func__
+
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/*
+ * Todo:
+ * Collect buckets that are marked as btree buckets but aren't in use anymore
+ * during garbage collection
+ * Need to flag cache devices that are dirty and invalidate the contents if it
+ * wasn't clean on register, as the priorities/gens will be out of sync.
+ * IO error handling
+ * Need to wait somehow till issued bio is on the request queue before we drop
+ * the reference on the bucket, or there's still a race between that and free's
+ * blkdev_issue_discard
+ * Need refcounting of data buckets to prevent a race between use and reuse
+ * Make btree_clean check for pointers to btree buckets that aren't in use
+ * Complete bios that were split correctly - done, untested
+ * Make btree_clean handle duplicate keys
+ * Multiple open data buckets so as to not fragment multiple streams of
+ * contiguous IO; also use pid or hash of pid to make random IO from a process
+ * go into the same bucket.
+ *
+ * Future:
+ * Journalling
+ * Write behind
+ * Checksumming
+ * SSDs that don't support trim would probably benefit from batching up writes
+ * to a full erase block.
+ *
+ * Stuff that should be made generic and taken out:
+ * fifos
+ * heap code
+ * bio_split_front()
+ * maybe eventually the search context stuff
+ */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@xxxxxxxxx>");
+
+#define UUIDS_PER_SB 256
+#define SB_SECTOR 8
+#define UUID_SECTOR 16
+#define PRIO_SECTOR 24
+
+/*
+ * Page 0: unused
+ * Page 1: superblock
+ * Page 2: device UUIDs
+ * Page 3+: bucket priorities
+ */
+
+#define DECLARE_FIFO(type, name) \
+ struct { \
+ size_t front, back, size; \
+ type *data; \
+ } name;
+
+#define init_fifo(f, s, gfp) ({ \
+ (f)->data = NULL; \
+ (f)->front = (f)->back = 0; \
+ (f)->size = roundup_pow_of_two(s) - 1; \
+ if ((f)->size * sizeof(*(f)->data) >= KMALLOC_MAX_SIZE) \
+ (f)->data = vmalloc((f)->size * sizeof(*(f)->data));\
+ else if ((f)->size > 0) \
+ (f)->data = kmalloc((f)->size * sizeof(*(f)->data), gfp);\
+ (f)->data; })
+
+#define free_fifo(f) do { \
+ if ((f)->size * sizeof(*(f)->data) >= KMALLOC_MAX_SIZE) \
+ vfree((f)->data); \
+ else if ((f)->size > 0) \
+ kfree((f)->data); \
+} while (0)
+
+#define fifo_push(f, i) ({ \
+ bool _r = false; \
+ if ((((f)->front + 1) & (f)->size) != (f)->back) { \
+ _r = true; \
+ (f)->data[(f)->front++] = i; \
+ (f)->front &= (f)->size; \
+ } _r; })
+
+#define fifo_pop(f, i) ({ \
+ bool _r = false; \
+ if ((f)->front != (f)->back) { \
+ _r = true; \
+ i = (f)->data[(f)->back++]; \
+ (f)->back &= (f)->size; \
+ } _r; })
+
+#define fifo_full(f) ((((f)->front + 1) & (f)->size) == (f)->back)
+
+#define DECLARE_HEAP(type, name) \
+ struct { \
+ size_t size; \
+ type *data; \
+ } name;
+
+#define init_heap(h, s, gfp) ({ \
+ (h)->data = NULL; \
+ (h)->size = 0; \
+ if (s * sizeof(*(h)->data) >= KMALLOC_MAX_SIZE) \
+ (h)->data = vmalloc(s * sizeof(*(h)->data)); \
+ else if (s > 0) \
+ (h)->data = kmalloc(s * sizeof(*(h)->data), gfp);\
+ (h)->data; })
+
+struct search_context;
+struct cached_bucket;
+
+typedef void (search_fn) (void *, struct bio *, struct search_context *);
+
+#define CACHE_CLEAN 1
+
+struct cache_sb {
+ uint8_t magic[16];
+ uint32_t version;
+ uint16_t block_size; /* sectors */
+ uint16_t bucket_size; /* sectors */
+ uint32_t journal_start; /* buckets */
+ uint32_t first_bucket; /* start of data */
+ uint64_t nbuckets; /* device size */
+ uint64_t btree_root;
+ uint16_t btree_level;
+};
+
+struct bucket {
+ size_t heap;
+ atomic_t pin;
+ uint16_t priority;
+ uint8_t generation;
+ uint8_t last_gc;
+};
+
+struct bucket_disk {
+ uint16_t priority;
+ uint8_t generation;
+} __attribute((packed));
+
+struct btree_node_header {
+ uint32_t csum;
+ uint32_t nkeys;
+ uint64_t random;
+};
+
+struct btree_key {
+ uint64_t key;
+ uint64_t ptr;
+};
+
+struct cache_device {
+ struct list_head list;
+ struct cache_sb sb;
+ struct page *sb_page;
+ spinlock_t sb_lock;
+
+ struct kobject kobj;
+ struct block_device *bdev;
+ struct module *owner;
+ struct work_struct work;
+
+ /*
+ * Buckets used for cached data go on the heap. The heap is ordered by
+ * bucket->priority; a priority of ~0 indicates a btree bucket. Priority
+ * is increased on cache hit, and periodically all the buckets on the
+ * heap have their priority scaled down by a linear function.
+ */
+ spinlock_t bucket_lock;
+ struct bucket **heap;
+ struct bucket *buckets;
+ struct bucket_disk *disk_buckets;
+ unsigned long heap_size;
+ long rescale;
+
+ uint8_t need_gc;
+ uint8_t *gc_in_progress;
+ struct rw_semaphore gc_lock;
+
+ int btree_buckets_cached;
+ struct list_head lru;
+
+ DECLARE_FIFO(long, free);
+
+ /*struct gendisk *devices[UUIDS_PER_SB];*/
+ short devices[UUIDS_PER_SB];
+ struct buffer_head *uuids;
+
+ long current_bucket;
+ struct btree_key current_key;
+ int sectors_free;
+ unsigned long last_used;
+
+ unsigned long cache_hits;
+
+ struct cached_bucket *root;
+};
+
+struct open_bucket {
+ struct list_head list;
+ struct cache_device *cache;
+
+ char identifier;
+ long bucket;
+ struct btree_key key;
+ int sectors_free;
+};
+
+struct cached_dev {
+ struct kobject kobj;
+ struct block_device *bdev;
+ struct module *owner;
+ struct work_struct work;
+};
+
+struct journal_header {
+ uint32_t csum;
+ uint32_t seq;
+ uint32_t last_open_entry;
+ uint32_t nr_entries;
+};
+
+struct cached_bucket {
+ struct rw_semaphore lock;
+ struct list_head lru;
+ struct search_context *wait;
+ struct cache_device *c; /* for bio_add_work_unlock */
+
+ atomic_t nread;
+ sector_t offset;
+ int level;
+
+ struct page *pages[];
+};
+
+struct search_context {
+ struct work_struct w;
+ atomic_t remaining;
+ struct search_context *parent;
+ struct search_context *next;
+ search_fn *end_fn;
+
+ struct btree_key new_keys[2];
+ void *q;
+ struct bio *bio;
+ int error;
+
+ sector_t bi_sector;
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+};
+
+static const char bcache_magic[] = {
+ 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 };
+
+static struct kobject *bcache_kobj;
+/*
+ * We need a real search key
+ * static struct gendisk *devices[UUIDS_PER_SB];
+ */
+static char uuids[UUIDS_PER_SB*16];
+
+static LIST_HEAD(cache_devices);
+static LIST_HEAD(open_buckets);
+
+static struct workqueue_struct *delayed;
+
+/*
+ * Sysfs vars / tunables
+ */
+static unsigned long cache_hits, cache_misses, rescale = 20480; /* sectors */
+static uint16_t initial_priority = SHORT_MAX
+static uint16_t cache_hit_priority = 100, cache_hit_seek = 100;
+
+static void do_btree_gc(struct work_struct *w);
+static void unregister_cache(struct kobject *k);
+static void write_super(struct cache_device *c);
+static void run_search(struct work_struct *w);
+static void save_priorities(struct cache_device *c);
+static void __btree_write_node(struct cache_device *c, struct cached_bucket *b,
+ int skip, int n);
+
+#define pages_per_btree (c->sb.bucket_size >> (PAGE_SHIFT - 9))
+#define keys_per_page (PAGE_SIZE / sizeof(struct btree_key))
+#define bucket_to_sector(b) (((sector_t) (b) + c->sb.first_bucket) * c->sb.bucket_size)
+#define sector_to_bucket(s) ((long) ((s) / c->sb.bucket_size) - c->sb.first_bucket)
+#define sector_to_gen(s) ({ smp_rmb(); c->buckets[sector_to_bucket(s)].generation; })
+#define sector_to_priority(s) ({ smp_rmb(); c->buckets[sector_to_bucket(s)].priority; })
+#define ptr_to_bucket(p) sector_to_bucket(PTR_OFFSET(p))
+#define bucket_to_ptr(b) TREE_PTR(sector_to_gen(b->offset), 0, b->offset)
+#define data(b) ((struct btree_key **) &(b)->pages[pages_per_btree])
+#define keys(i) (((struct btree_node_header *) *(i))->nkeys)
+#define rand(i) (((struct btree_node_header *) *(i))->random)
+#define header(b) ((struct btree_node_header *) data(b)[0])
+#define index(i, b) (i - data(b))
+#define last_key(i) (node(i, keys(i))->key)
+#define bucket_key(b) ((struct btree_key) { \
+ .key = node(data(b), header(b)->nkeys)->key,\
+ .ptr = bucket_to_ptr(b) })
+
+#define label(l, foo) if (0) { l: foo; }
+
+/*
+ * key: 8 bit device, 56 bit offset
+ * value: 8 bit generation, 16 bit length, 40 bit offset
+ * All units are in sectors
+ */
+
+static inline struct btree_key *node(struct btree_key *d[], int i)
+{
+ return d[i / keys_per_page] + (i % keys_per_page);
+}
+
+#define TREE_KEY(device, offset) ((offset) | ((uint64_t) device) << 56)
+#define KEY_DEV(p) (p >> 56)
+#define KEY_OFFSET(p) (p & ~((uint64_t) ~0 << 56))
+#define TREE_KEY_DEV(page, i) KEY_DEV(node(page, i)->key)
+#define TREE_KEY_OFFSET(page, i) KEY_OFFSET(node(page, i)->key)
+
+#define TREE_PTR(gen, length, offset) ((gen) | (length) << 8 | (uint64_t) (offset) << 24)
+#define PTR_GEN(p) ((uint8_t) ((p) & ~(~0 << 8)))
+#define PTR_LENGTH(p) (((p) >> 8) & ~(~0 << 16))
+#define PTR_OFFSET(p) ((p) >> 24)
+#define TREE_PTR_GEN(page, i) PTR_GEN(node(page, i)->ptr)
+#define TREE_PTR_LENGTH(page, i) PTR_LENGTH(node(page, i)->ptr)
+#define TREE_PTR_OFFSET(page, i) PTR_OFFSET(node(page, i)->ptr)
+
+static inline void rw_lock(bool write, struct rw_semaphore *lock, int class)
+{ write ? down_write_nested(lock, class) : down_read_nested(lock, class); }
+
+static inline void rw_unlock(bool write, struct rw_semaphore *lock)
+{ write ? up_write(lock) : up_read(lock); }
+
+static int is_zero(void *p, size_t n)
+{
+ int i;
+ for (i = 0; i < n; i++)
+ if (((char *) p)[i])
+ return 0;
+ return 1;
+}
+
+static int lookup_dev(struct cache_device *c, struct bio *bio)
+{
+ int dev;
+ for (dev = 0; dev < UUIDS_PER_SB; dev++)
+ if (c->devices[dev] == bio->bi_bdev->bd_cache_identifier)
+ break;
+
+ if (dev == UUIDS_PER_SB)
+ printk(KERN_DEBUG "bcache: unknown device %i",
+ bio->bi_bdev->bd_cache_identifier);
+
+ return dev;
+}
+
+static void run_search(struct work_struct *w)
+{
+ struct search_context *s = container_of(w, struct search_context, w);
+ s->end_fn(s->q, s->bio, s);
+}
+
+static void put_search(struct search_context *s)
+{
+ if (atomic_dec_and_test(&s->remaining)) {
+ smp_rmb();
+ atomic_set(&s->remaining, 1);
+ if (!s->end_fn)
+ kzfree(s);
+ else
+ BUG_ON(!queue_work(delayed, &s->w));
+ }
+}
+
+#define return_f(s, f) \
+ do { \
+ if (!object_is_on_stack(s)) { \
+ s->end_fn = f; \
+ smp_wmb(); \
+ put_search(s); \
+ } \
+ return; \
+ } while (0)
+
+#define add_wait_list(s, l) do { \
+ s = alloc_search(s); \
+ atomic_inc(&(s)->remaining); \
+ do \
+ (s)->next = l; \
+ while (cmpxchg(&(l), (s)->next, s) != (s)->next); \
+} while (0)
+
+#define run_wait_list(s) do { \
+ struct search_context *t = s; \
+ if (!t) \
+ break; \
+ s = t->next; \
+ put_search(t); \
+} while (1)
+
+static struct search_context *alloc_search(struct search_context *s)
+{
+ struct search_context *r = s;
+ if (object_is_on_stack(s)) {
+ r = kzalloc(sizeof(*s), GFP_NOIO);
+ *r = *s;
+ atomic_set(&r->remaining, 1);
+ INIT_WORK(&r->w, run_search);
+ }
+ return r;
+}
+
+static void __inc_bucket_gen(struct cache_device *c, long b)
+{
+
+ c->need_gc = max_t(uint8_t, c->need_gc,
+ ++c->buckets[b].generation - c->buckets[b].last_gc);
+
+ if (c->need_gc > 64 &&
+ !c->gc_in_progress) {
+ long i;
+ uint8_t *gc_in_progress;
+
+ spin_unlock(&c->bucket_lock);
+ gc_in_progress = kmalloc(c->sb.nbuckets, GFP_NOIO);
+ spin_lock(&c->bucket_lock);
+
+ if (!gc_in_progress)
+ return;
+
+ if (c->gc_in_progress) {
+ kfree(gc_in_progress);
+ return;
+ }
+
+ c->gc_in_progress = gc_in_progress;
+
+ for (i = 0; i < c->sb.nbuckets; i++)
+ c->gc_in_progress[i] = c->buckets[i].generation;
+
+ INIT_WORK(&c->work, do_btree_gc);
+ queue_work(delayed, &c->work);
+ }
+}
+
+static inline void inc_bucket_gen(struct cache_device *c, long b)
+{
+ spin_lock(&c->bucket_lock);
+ __inc_bucket_gen(c, b);
+ spin_unlock(&c->bucket_lock);
+}
+
+#define inc_gen(c, o) inc_bucket_gen(c, sector_to_bucket(o))
+
+static void __rescale_heap(struct cache_device *c, int sectors)
+{
+ long i;
+ c->rescale -= sectors;
+ if (c->rescale <= 0) {
+ for (i = 0; i < c->heap_size; i++) {
+ BUG_ON(c->heap[i]->priority == (uint16_t) ~0);
+ c->heap[i]->priority =
+ ((long) c->heap[i]->priority * 7) / 8;
+ }
+ c->rescale += rescale;
+
+ save_priorities(c);
+ }
+}
+
+static void rescale_heap(struct cache_device *c, int sectors)
+{
+ spin_lock(&c->bucket_lock);
+ __rescale_heap(c, sectors);
+ spin_unlock(&c->bucket_lock);
+}
+
+#define heap_cmp(i, j) (c->heap[i]->priority >= c->heap[j]->priority)
+
+static int heap_swap(struct cache_device *c, long i, long j)
+{
+ if (heap_cmp(i, j))
+ return 1;
+
+ swap(c->heap[i], c->heap[j]);
+
+ c->heap[i]->heap = i;
+ c->heap[j]->heap = j;
+ return 0;
+}
+
+static void __heap_sift(struct cache_device *c, long h)
+{
+ long r;
+
+ for (; h * 2 + 1 < c->heap_size; h = r) {
+ r = h * 2 + 1;
+ if (r + 1 < c->heap_size &&
+ heap_cmp(r, r + 1))
+ r++;
+
+ if (heap_swap(c, r, h))
+ break;
+ }
+}
+
+static void __heap_insert(struct cache_device *c, long b)
+{
+ if (c->buckets[b].heap == -1) {
+ long p, h = c->heap_size++;
+
+ BUG_ON(c->buckets[b].priority == (uint16_t) ~0);
+ c->buckets[b].priority = initial_priority;
+ c->buckets[b].heap = h;
+ c->heap[h] = &c->buckets[b];
+
+ for (p = (h - 1) / 2; p; h = p, p = (h - 1) / 2)
+ if (heap_swap(c, h, p))
+ break;
+
+ __heap_sift(c, h);
+
+ pr_debug("inserted bucket %li, new heap size %li, heap location %li",
+ b, c->heap_size, c->buckets[b].heap);
+ } else
+ __heap_sift(c, c->buckets[b].heap);
+}
+
+static void heap_insert(struct cache_device *c, long b)
+{
+ spin_lock(&c->bucket_lock);
+ __heap_insert(c, b);
+ spin_unlock(&c->bucket_lock);
+}
+
+static long __heap_pop(struct cache_device *c)
+{
+ long ret = c->heap[0] - c->buckets;
+
+ if (!c->heap_size) {
+ WARN(!c->heap_size, "bcache: empty heap!");
+ return -1;
+ }
+
+ __inc_bucket_gen(c, ret);
+ smp_mb();
+ if (atomic_read(&c->heap[0]->pin))
+ return -1;
+
+ heap_swap(c, 0, --c->heap_size);
+
+ __heap_sift(c, 0);
+ c->heap[c->heap_size] = NULL;
+
+ pr_debug("priority %i sector %lu", c->buckets[ret].priority,
+ bucket_to_sector(ret));
+
+ c->buckets[ret].priority = 0;
+ c->buckets[ret].heap = -1;
+ return ret;
+}
+
+static long __pop_bucket(struct cache_device *c)
+{
+ long r;
+ while (!fifo_full(&c->free)) {
+ r = __heap_pop(c);
+
+ if (r == -1)
+ break;
+
+ fifo_push(&c->free, r);
+
+ if (blk_queue_discard(bdev_get_queue(c->bdev))) {
+ spin_unlock(&c->bucket_lock);
+ blkdev_issue_discard(c->bdev, bucket_to_sector(r),
+ c->sb.bucket_size, GFP_NOIO, 0);
+ spin_lock(&c->bucket_lock);
+ }
+ }
+
+ if (!fifo_pop(&c->free, r))
+ r = -1;
+
+ return r;
+}
+
+static bool pop_bucket(struct cache_device *c)
+{
+ long b = __pop_bucket(c);
+
+ if (c->current_bucket)
+ __heap_insert(c, sector_to_bucket(c->current_bucket));
+
+ if (b == -1) {
+ c->sectors_free = 0;
+ c->current_bucket = 0;
+ return false;
+ }
+
+ c->current_bucket = bucket_to_sector(b);
+ c->sectors_free = c->sb.bucket_size;
+ return true;
+}
+
+static void free_bucket_contents(struct cache_device *c,
+ struct cached_bucket *b)
+{
+ int i;
+ /* XXX: check for dirty pages */
+ for (i = 0; i < pages_per_btree; i++)
+ if (b->pages[i]) {
+ kunmap(b->pages[i]);
+ put_page(b->pages[i]);
+ b->pages[i] = NULL;
+ }
+
+ /*struct address_space *mapping = p[0]->mapping;
+
+ spin_lock_irq(&mapping->tree_lock);
+ for (i = 0; i < pages; i++)
+ __remove_from_page_cache(p[i]);
+ spin_unlock_irq(&mapping->tree_lock);*/
+}
+
+static void fill_bucket_endio(struct bio *bio, int error)
+{
+ int i, n;
+ struct cached_bucket *b = bio->bi_private;
+ struct cache_device *c = b->c;
+
+ for (i = 0; i < bio->bi_vcnt; i++)
+ unlock_page(bio->bi_io_vec[i].bv_page);
+
+ do {
+ n = atomic_read(&b->nread);
+ for (i = n; i < pages_per_btree; i++)
+ if (PageLocked(b->pages[i]))
+ break;
+ } while (atomic_cmpxchg(&b->nread, n, i) != i);
+
+ if (i == pages_per_btree)
+ run_wait_list(b->wait);
+
+ bio_put(bio);
+}
+
+static int fill_bucket(struct cache_device *c, struct cached_bucket *b,
+ struct search_context **s)
+{
+ int i, nread, ret = 0;
+ struct bio *bio = NULL;
+
+ nread = find_get_pages(c->bdev->bd_inode->i_mapping,
+ (b->offset >> (PAGE_SHIFT - 9)),
+ pages_per_btree, b->pages);
+
+ for (i = 0; i < pages_per_btree; i++)
+ if (!b->pages[i]) {
+ b->pages[i] = __page_cache_alloc(GFP_NOIO);
+ b->pages[i]->mapping = c->bdev->bd_inode->i_mapping;
+ if (add_to_page_cache_lru(b->pages[i],
+ c->bdev->bd_inode->i_mapping,
+ (b->offset >> (PAGE_SHIFT - 9)) + i,
+ GFP_NOIO & GFP_RECLAIM_MASK)) {
+ /* XXX: need to check for actual errors */
+ page_cache_release(b->pages[i]);
+ b->pages[i] = find_get_page(c->bdev->bd_inode->i_mapping,
+ (b->offset >> (PAGE_SHIFT - 9)) + i);
+ BUG_ON(!b->pages[i]);
+ goto wait;
+ }
+
+ if (!bio) {
+ bio = bio_kmalloc(GFP_NOIO, pages_per_btree - nread);
+ bio->bi_bdev = c->bdev;
+ bio->bi_sector = b->offset + (i << (PAGE_SHIFT - 9));
+
+ bio->bi_private = b;
+ bio->bi_end_io = fill_bucket_endio;
+ atomic_inc(&c->buckets[sector_to_bucket(b->offset)].pin); /* wrong */
+
+ pr_debug("reading at sector %li, page_index %li",
+ bio->bi_sector, page_index(b->pages[i]));
+ }
+ nread++;
+
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = b->pages[i];
+ bio->bi_io_vec[bio->bi_vcnt].bv_len = PAGE_SIZE;
+ bio->bi_io_vec[bio->bi_vcnt].bv_offset = 0;
+ bio->bi_vcnt++;
+ bio->bi_size += PAGE_SIZE;
+ data(b)[i] = kmap(b->pages[i]);
+ } else {
+wait: wait_on_page_locked(b->pages[i]);
+
+ if (bio)
+ submit_bio(READ, bio);
+ bio = NULL;
+ if (i == ret)
+ ret++;
+ data(b)[i] = kmap(b->pages[i]);
+ }
+
+ if (bio)
+ submit_bio(READ, bio);
+
+ return ret;
+}
+
+static struct cached_bucket *get_bucket(struct cache_device *c, sector_t offset,
+ int level, bool write,
+ struct search_context **s)
+{
+ bool f = false;
+ int i;
+ struct cached_bucket *b, *n = NULL;
+retry:
+ if (sector_to_priority(offset) != (uint16_t) ~0)
+ return NULL;
+
+ i = 0;
+ spin_lock(&c->bucket_lock);
+ list_for_each_entry(b, &c->lru, lru) {
+ if (offset == b->offset) {
+ list_move(&b->lru, &c->lru);
+ spin_unlock(&c->bucket_lock);
+
+ rw_lock(write, &b->lock, level);
+
+ if (offset == b->offset)
+ goto out;
+
+ rw_unlock(write, &b->lock);
+ goto retry;
+ }
+ i++;
+ }
+
+ b = list_entry(c->lru.prev, struct cached_bucket, lru);
+ if (i >= c->btree_buckets_cached && down_write_trylock(&b->lock)) {
+ list_del(&b->lru);
+
+ if (!write)
+ downgrade_write(&b->lock);
+ f = true;
+ } else if (n) {
+ b = n;
+ n = NULL;
+ b->c = c;
+ init_rwsem(&b->lock);
+ BUG_ON(write /* lockdep */
+ ? !down_write_trylock(&b->lock)
+ : !down_read_trylock(&b->lock));
+ } else {
+ spin_unlock(&c->bucket_lock);
+ n = kzalloc(sizeof(*n) + sizeof(void *) * pages_per_btree * 2,
+ GFP_NOIO);
+ if (!n)
+ return NULL;
+
+ goto retry;
+ }
+
+ i = atomic_read(&b->nread);
+ atomic_set(&b->nread, 0);
+ b->offset = offset;
+ b->level = level;
+ list_add(&b->lru, &c->lru);
+ spin_unlock(&c->bucket_lock);
+
+ if (f) {
+ __btree_write_node(c, b, 0, i);
+ free_bucket_contents(c, b);
+ }
+
+ i = fill_bucket(c, b, s);
+ atomic_set(&b->nread, i);
+out:
+ if (i == pages_per_btree)
+ run_wait_list(b->wait);
+ else
+ add_wait_list(*s, b->wait);
+
+ kzfree(n);
+ if (sector_to_priority(offset) == (uint16_t) ~0)
+ return b;
+
+ rw_unlock(write, &b->lock);
+ pr_debug("bucket %lu has been freed, gen %i",
+ offset, sector_to_gen(offset));
+ inc_gen(c, offset);
+ return NULL;
+}
+
+static struct cached_bucket *upgrade_bucket(struct cache_device *c,
+ struct cached_bucket *b,
+ struct search_context **s)
+{
+ int level = b->level;
+ sector_t offset = b->offset;
+
+ rw_unlock(false, &b->lock);
+ rw_lock(true, &b->lock, level);
+
+ if (b->offset != offset) {
+ rw_unlock(true, &b->lock);
+ return get_bucket(c, offset, level, true, s);
+ }
+
+ if (sector_to_priority(b->offset) != (uint16_t) ~0) {
+ rw_unlock(true, &b->lock);
+ return NULL;
+ }
+ return b;
+}
+
+#define upgrade_or_retry(c, b, write, s) ({ \
+ if (!write) \
+ b = upgrade_bucket(c, b, s); \
+ write = true; \
+ if (!b) \
+ goto retry; \
+ b; })
+
+static struct cached_bucket *btree_alloc(struct cache_device *c, int level,
+ struct btree_key *old[],
+ int nkeys, int skip, bool lru)
+{
+ long i = 0;
+ struct btree_node_header *h;
+ struct cached_bucket *b;
+ const char *err;
+
+ spin_lock(&c->bucket_lock);
+ list_for_each_entry(b, &c->lru, lru)
+ i++;
+
+ b = list_entry(c->lru.prev, struct cached_bucket, lru);
+ if (i >= c->btree_buckets_cached && down_write_trylock(&b->lock)) {
+ list_del(&b->lru);
+ free_bucket_contents(c, b);
+ /* parallel to get_bucket, need to __write */
+
+ b->offset = 0;
+ spin_unlock(&c->bucket_lock);
+ } else {
+ spin_unlock(&c->bucket_lock);
+ err = "bcache: btree_alloc: no mem allocating bucket";
+ b = kzalloc(sizeof(*b) + sizeof(void *) * pages_per_btree * 2,
+ GFP_NOIO);
+ if (!b)
+ goto err;
+
+ init_rwsem(&b->lock);
+ BUG_ON(!down_write_trylock(&b->lock)); /* lockdep */
+ }
+ b->c = c;
+ b->level = level;
+ atomic_set(&b->nread, pages_per_btree);
+
+ err = "bcache: btree_alloc: unable to alloc bucket";
+
+ spin_lock(&c->bucket_lock);
+ i = __pop_bucket(c);
+ if (i == -1) {
+ spin_unlock(&c->bucket_lock);
+ goto err;
+ }
+ c->buckets[i].priority = ~0;
+ b->offset = bucket_to_sector(i);
+ spin_unlock(&c->bucket_lock);
+
+ for (i = 0; i < pages_per_btree; i++) {
+ b->pages[i] = find_or_create_page(c->bdev->bd_inode->i_mapping,
+ (b->offset >> (PAGE_SHIFT - 9)) + i,
+ GFP_NOIO);
+ err = "bcache: btree_alloc: error adding new pages";
+ if (!b->pages[i]) {
+ free_bucket_contents(c, b);
+ goto err;
+ }
+
+ unlock_page(b->pages[i]);
+ b->pages[i + pages_per_btree] = kmap(b->pages[i]);
+ }
+
+ h = header(b);
+ get_random_bytes(&h->random, sizeof(uint64_t));
+ h->nkeys = nkeys - skip;
+
+ if (old)
+ for (i = 1; i <= h->nkeys; i++)
+ *node(data(b), i) = *node(old, i + skip);
+
+ for (i = 0; i < h->nkeys / keys_per_page + 1; i++)
+ SetPageDirty(b->pages[i]);
+
+ if (lru) {
+ spin_lock(&c->bucket_lock);
+ list_add(&b->lru, &c->lru);
+ spin_unlock(&c->bucket_lock);
+ }
+
+ pr_debug("sector %lu", b->offset);
+
+ return b;
+err:
+ if (b) {
+ /* Just let it be reused */
+ b->offset = 0;
+ up_write(&b->lock);
+ spin_lock(&c->bucket_lock);
+ list_add(&b->lru, &c->lru);
+ spin_unlock(&c->bucket_lock);
+ }
+ printk(KERN_WARNING "%s", err);
+ return NULL;
+}
+
+static void btree_free(struct cache_device *c, struct cached_bucket *b)
+{
+ long n = sector_to_bucket(b->offset);
+ BUG_ON(n < 0 || n > c->sb.nbuckets);
+ BUG_ON(b == c->root);
+
+ spin_lock(&c->bucket_lock);
+
+ __inc_bucket_gen(c, n);
+ c->buckets[n].priority = 0;
+
+ if (!fifo_push(&c->free, n))
+ __heap_insert(c, n);
+
+ spin_unlock(&c->bucket_lock);
+
+ /*
+ * Need to check if b->nread != pages_per_btree...
+ */
+
+ if (c->sb.btree_level == b->level) {
+ spin_lock(&c->bucket_lock);
+ list_add(&b->lru, &c->lru);
+ spin_unlock(&c->bucket_lock);
+ }
+
+ blkdev_issue_discard(c->bdev, b->offset,
+ c->sb.bucket_size, GFP_NOIO, 0);
+
+ pr_debug("bucket %li, sector %lu", n, b->offset);
+}
+
+static void set_new_root(struct cache_device *c, struct cached_bucket *b)
+{
+ BUG_ON(sector_to_priority(b->offset) != (uint16_t) ~0);
+ spin_lock(&c->sb_lock);
+ c->sb.btree_level = b->level;
+ c->sb.btree_root = b->offset;
+ c->root = b;
+ write_super(c);
+ spin_unlock(&c->sb_lock);
+ pr_debug("new root %lli", c->sb.btree_root);
+}
+
+static void btree_write_node_endio(struct bio *bio, int error)
+{
+ int i;
+ for (i = 0; i > bio->bi_vcnt; i++)
+ put_page(bio->bi_io_vec[i].bv_page);
+
+ bio_put(bio);
+}
+
+static void __btree_write_node(struct cache_device *c, struct cached_bucket *b,
+ int skip, int n)
+{
+ int i;
+ struct bio *bio = NULL;
+
+ BUG_ON(n > pages_per_btree);
+
+ for (i = skip; i < n; i++) {
+ BUG_ON(!b->pages[i]);
+
+ if (!PageDirty(b->pages[i])) {
+ if (bio) {
+ submit_bio(WRITE, bio);
+ bio = NULL;
+ }
+ continue;
+ }
+
+ if (!bio) {
+ bio = bio_kmalloc(GFP_NOIO, n - i);
+ if (!bio) {
+ pr_debug("couldn't allocate bio!");
+ return;
+ }
+
+ bio->bi_bdev = c->bdev;
+ bio->bi_sector = page_index(b->pages[i])
+ << (PAGE_SHIFT - 9);
+ bio->bi_end_io = btree_write_node_endio;
+ }
+
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = b->pages[i];
+ bio->bi_io_vec[bio->bi_vcnt].bv_len = PAGE_SIZE;
+ bio->bi_io_vec[bio->bi_vcnt].bv_offset = 0;
+
+ bio->bi_size += PAGE_SIZE;
+ bio->bi_vcnt++;
+
+ get_page(b->pages[i]);
+ ClearPageDirty(b->pages[i]);
+ }
+
+ pr_debug("sector %li pages %i", bio->bi_sector, n);
+ if (bio)
+ submit_bio(WRITE, bio);
+}
+
+static void btree_write_node(struct cache_device *c, struct cached_bucket *b,
+ int skip)
+{
+ int n = keys(&data(b)[skip]) / keys_per_page + 1;
+
+ if (((keys(&data(b)[skip]) + 1) % keys_per_page) == 0 &&
+ PageDirty(b->pages[skip]))
+ __btree_write_node(c, b, skip, n + skip);
+}
+
+static struct bio *bio_split_front(struct bio *bio, sector_t sectors)
+{
+ int idx = 0, nbytes = sectors << 9;
+ struct bio_vec *bv;
+ struct bio *ret = NULL;
+
+ if (sectors >= bio_sectors(bio)) {
+ bio->bi_vcnt -= bio->bi_idx;
+ bio->bi_max_vecs -= bio->bi_idx;
+ bio->bi_io_vec += bio->bi_idx;
+ bio->bi_idx = 0;
+ return bio;
+ }
+ pr_debug("splitting");
+
+ bio_for_each_segment(bv, bio, idx) {
+ if (!nbytes) {
+ if (!(ret = bio_kmalloc(GFP_NOIO, 0)))
+ break;
+
+ ret->bi_vcnt = idx - bio->bi_idx;
+ ret->bi_io_vec = &bio->bi_io_vec[bio->bi_idx];
+ break;
+ } else if (nbytes < bv->bv_len) {
+ int vcnt = idx - bio->bi_idx + 1;
+ if (!(ret = bio_kmalloc(GFP_NOIO, vcnt)))
+ break;
+
+ ret->bi_vcnt = vcnt;
+ memcpy(ret->bi_io_vec,
+ &bio->bi_io_vec[bio->bi_idx],
+ sizeof(struct bio_vec) * vcnt);
+
+ ret->bi_io_vec[vcnt-1].bv_len = nbytes;
+ bv->bv_offset += nbytes;
+ break;
+ }
+
+ nbytes -= bv->bv_len;
+ }
+
+ if (ret) {
+ ret->bi_sector = bio->bi_sector;
+ ret->bi_size = sectors << 9;
+ ret->bi_bdev = bio->bi_bdev;
+ ret->bi_flags |= 1 << BIO_SPLIT;
+ ret->bi_rw = bio->bi_rw;
+ ret->bi_size = bio->bi_size;
+ ret->bi_idx = 0;
+
+ bio->bi_sector += sectors;
+ bio->bi_size -= sectors << 9;
+ bio->bi_idx += idx;
+
+ ret->bi_private = bio;
+ ret->bi_end_io = NULL;
+ atomic_inc(&bio->bi_remaining);
+ } else
+ pr_debug("failed");
+
+ return ret;
+}
+
+static void bio_add_work(struct bio *bio, int error)
+{
+ struct search_context *s = bio->bi_private;
+
+ s->error = error;
+ bio_put(bio);
+ put_search(s);
+}
+
+static void cache_hit(struct cache_device *c, struct bio *list)
+{
+ long b;
+ struct bio *bio;
+ if (!list)
+ return;
+
+ spin_lock(&c->bucket_lock);
+ for (bio = list; bio; bio = bio->bi_next) {
+ bio->bi_bdev = c->bdev;
+
+ b = sector_to_bucket(bio->bi_sector);
+ BUG_ON(c->buckets[b].priority == (uint16_t) ~0);
+ c->buckets[b].priority = (long) initial_priority;
+ /* * (cache_hit_seek + cache_hit_priority
+ * bio_sectors(bio) / c->sb.bucket_size)
+ / (cache_hit_seek + cache_hit_priority);*/
+
+ __heap_insert(c, b);
+
+ __rescale_heap(c, bio_sectors(bio));
+ c->cache_hits++;
+ cache_hits++;
+ }
+ spin_unlock(&c->bucket_lock);
+
+ while (list) {
+ sector_t s = list->bi_sector;
+ bio = list;
+ list = bio->bi_next;
+ bio->bi_next = NULL;
+
+ __generic_make_request(bio);
+ atomic_dec(&c->buckets[sector_to_bucket(s)].pin);
+ }
+}
+
+static bool __ptr_bad(struct cache_device *c, uint64_t p)
+{
+ sector_t o = PTR_OFFSET(p), l = PTR_LENGTH(p);
+ if (sector_to_bucket(o) < 0 ||
+ sector_to_bucket(o) >= c->sb.nbuckets ||
+ l + o % c->sb.bucket_size > c->sb.bucket_size) {
+ pr_debug("bad ptr %llu: offset %lu len %lu", p, o, l);
+ return true;
+ }
+ return PTR_GEN(p) != sector_to_gen(PTR_OFFSET(p));
+}
+
+#define ptr_bad(c, p) __ptr_bad(c, p->ptr)
+
+#define run_on_root(write, f, ...) ({ \
+ int _r = -2; \
+ do { \
+ bool _w = (write); \
+ rw_lock(_w, &c->root->lock, c->root->level); \
+ if (sector_to_priority(c->root->offset) == (uint16_t) ~0\
+ && _w == (write)) \
+ _r = f(c, c->root, __VA_ARGS__); \
+ else { \
+ rw_unlock(_w, &c->root->lock); \
+ cpu_relax(); \
+ } \
+ } while (_r == -2); \
+ _r; })
+
+#define sorted_set_checks(i, b) ({ \
+ bool _cont = true; \
+ if (index(i, b) >= nread) \
+ goto again; \
+ if (rand(i) != header(b)->random) \
+ _cont = false; \
+ else if (keys(i) >= (pages_per_btree - index(i, b)) * keys_per_page) {\
+ pr_debug("bad btree header: page %li h->nkeys %i", \
+ index(i, b), keys(i)); \
+ keys(i) = 0; \
+ if (i != data(b)) \
+ _cont = false; \
+ } else if (keys(i) >= (nread - index(i, b)) * keys_per_page) \
+ goto again; \
+ _cont; })
+
+/* Iterate over the sorted sets of pages
+ */
+#define for_each_sorted_set(i, b) \
+ for (i = data(b), nread = atomic_read(&b->nread); \
+ index(i, b) < pages_per_btree && sorted_set_checks(i, b); \
+ i += (keys(i) / keys_per_page) + 1)
+
+#define for_each_key(i, j, b) \
+ for_each_sorted_set(i, b) \
+ for (j = 1; j <= keys(i); j++)
+
+/*
+ * Returns the smallest key greater than the search key.
+ * This is because we index by the end, not the beginning
+ */
+static int btree_bsearch(struct btree_key *data[], uint64_t search)
+{
+ int l = 1, r = keys(data) + 1;
+
+ while (l < r) {
+ int m = (l + r) >> 1;
+ if (node(data, m)->key > search)
+ r = m;
+ else
+ l = m + 1;
+ }
+
+ return l;
+}
+
+static int btree_search(struct cache_device *c, struct cached_bucket *b,
+ int device, struct bio *bio, struct search_context **s)
+{
+ int ret = -1, j, last, nread;
+ struct btree_key **i;
+ struct bio *split;
+ struct cached_bucket *recurse;
+
+ uint64_t search = TREE_KEY(device, bio->bi_sector);
+
+ pr_debug("at %lu searching for %llu, nread %i",
+ b->offset, search, atomic_read(&b->nread));
+
+ for_each_sorted_set(i, b)
+ for (j = btree_bsearch(i, search), last = 0;
+ j <= keys(i) && ret < 1;
+ j++) {
+ if (ptr_bad(c, node(i, j)))
+ continue;
+
+ BUG_ON(node(i, j)->key <= search);
+
+ if (device != TREE_KEY_DEV(i, j) ||
+ (last && search + bio_sectors(bio) <= node(i, last)->key))
+ break;
+
+ last = j;
+
+ pr_debug("loop %i of %i page %li level %i key %llu ptr %llu", j,
+ keys(i), i - data(b), b->level, node(i, j)->key, node(i, j)->ptr);
+
+ if (b->level) {
+ recurse = get_bucket(c, TREE_PTR_OFFSET(i, j), b->level - 1, false, s);
+ if (recurse)
+ ret = max(ret, btree_search(c, recurse, device, bio, s));
+ } else
+ if (search >= node(i, j)->key - TREE_PTR_LENGTH(i, j)) {
+ long bucket = sector_to_bucket(TREE_PTR_OFFSET(i, j));
+ atomic_inc(&c->buckets[bucket].pin);
+ smp_mb__after_atomic_inc();
+ if (sector_to_gen(TREE_PTR_OFFSET(i, j)) !=
+ TREE_PTR_GEN(i, j)) {
+ atomic_dec(&c->buckets[bucket].pin);
+ continue;
+ }
+
+ split = bio_split_front(bio, node(i, j)->key - search);
+ if (!split)
+ goto err;
+
+ split->bi_sector = TREE_PTR_OFFSET(i, j) + (split->bi_sector -
+ (TREE_KEY_OFFSET(i, j) - TREE_PTR_LENGTH(i, j)));
+
+
+ if (split != bio) {
+ pr_debug("partial cache hit");
+ split->bi_next = bio->bi_next;
+ bio->bi_next = split;
+ } else
+ ret = 1;
+ }
+ search = TREE_KEY(device, bio->bi_sector);
+ }
+
+ label(err, ret = -1);
+ label(again, ret = 0);
+ rw_unlock(false, &b->lock);
+ return ret;
+}
+
+static void btree_sort(void *base, size_t num)
+{
+ size_t i;
+
+ void sift(size_t r, size_t n)
+ {
+ int c = r * 2;
+ for (; c <= n; r = c, c *= 2) {
+ if (c < n &&
+ node(base, c)->key < node(base, c + 1)->key)
+ c++;
+ if (node(base, r)->key >= node(base, c)->key)
+ return;
+ swap(*node(base, r), *node(base, c));
+ }
+ }
+
+ for (i = num / 2 + 1; i > 0; --i)
+ sift(i, num);
+
+ for (i = num; i > 1; sift(1, --i))
+ swap(*node(base, 1), *node(base, i));
+}
+
+static void btree_clean(struct cache_device *c, struct cached_bucket *b)
+{
+ size_t j, n, orig;
+ int nkeys;
+ struct btree_node_header *h = header(b);
+ struct btree_key **i;
+
+ orig = nkeys = h->nkeys;
+ for (j = 1; j <= nkeys; j++)
+ while (j <= nkeys && ptr_bad(c, node(data(b), j)))
+ if (j <= --nkeys)
+ *node(data(b), j) = *node(data(b), nkeys + 1);
+
+ for (h = header(b), i = data(b);
+ i < data(b) + pages_per_btree &&
+ h->random == header(b)->random;
+ i += (n / keys_per_page) + 1, h = (struct btree_node_header *) *i) {
+
+ if (h->nkeys >= (pages_per_btree - (i - data(b))) * keys_per_page) {
+ pr_debug("bad btree header: page %li h->nkeys %i",
+ i - data(b), h->nkeys);
+ h->nkeys = h->random = 0;
+ break;
+ }
+
+ n = h->nkeys;
+ if (data(b) == i)
+ continue;
+ orig += h->nkeys;
+
+ for (j = 1; j <= n; j++)
+ if (!ptr_bad(c, node(i, j)))
+ *node(data(b), ++nkeys) = *node(i, j);
+ }
+ h = header(b);
+ h->nkeys = nkeys;
+
+ pr_debug("merged %i keys from %lu keys", h->nkeys, orig);
+ btree_sort(data(b), h->nkeys);
+}
+
+static int btree_gc(struct cache_device *c, struct cached_bucket *b,
+ struct btree_key *root, struct search_context *s)
+{
+ int j, r, ret = 0, nread;
+ uint64_t rand;
+ bool write = false;
+ struct btree_key **i;
+ struct cached_bucket *n = NULL, *recurse;
+
+ for_each_key(i, j, b)
+ ret = max_t(uint8_t, ret,
+ sector_to_gen(TREE_PTR_OFFSET(i, j)) -
+ TREE_PTR_GEN(i, j));
+
+ if (ret > 10 && PageDirty(b->pages[0])) {
+ write = true;
+ b = upgrade_bucket(c, b, &s);
+ if (!b)
+ return 0;
+ }
+
+ if (ret > 10 && !write) {
+ n = btree_alloc(c, b->level, NULL, 0,
+ 0, c->sb.btree_level != b->level);
+ if (n) {
+ rand = header(n)->random;
+ for (j = 0; j < pages_per_btree; j++)
+ memcpy(data(n)[j], data(b)[j], PAGE_SIZE);
+ header(n)->random = rand;
+ swap(b, n);
+ write = true;
+ }
+ }
+
+ if (write) {
+ btree_clean(c, b);
+ *root = bucket_key(b);
+ ret = 0;
+ }
+
+ if (b->level)
+ for_each_key(i, j, b)
+ if (!ptr_bad(c, node(i, j)))
+ if ((recurse = get_bucket(c, TREE_PTR_OFFSET(i, j),
+ b->level - 1, false, &s))) {
+ struct btree_key k = *node(i, j);
+ r = btree_gc(c, recurse, write ? node(i, j) : &k, s);
+ if (r < 0)
+ goto again;
+ ret = max_t(uint8_t, ret, r);
+ }
+
+ btree_write_node(c, b, 0);
+
+ label(again, ret = -1);
+ rw_unlock(write, &b->lock);
+
+ if (n) {
+ if (c->sb.btree_level == b->level)
+ set_new_root(c, b);
+
+ btree_free(c, n);
+ rw_unlock(false, &n->lock);
+ }
+
+ pr_debug("ret %i", ret);
+ return ret;
+}
+
+static void do_btree_gc(struct work_struct *w)
+{
+ uint8_t *t;
+ long i, r;
+ struct btree_key root;
+ struct cache_device *c = container_of(w, struct cache_device, work);
+ struct search_context s = { .q = c, .end_fn = NULL };
+
+ pr_debug("collecting garbage, need_gc %i", c->need_gc);
+
+ down_write(&c->gc_lock);
+ r = run_on_root(false, btree_gc, &root, &s);
+ up_write(&c->gc_lock);
+
+ spin_lock(&c->bucket_lock);
+
+ if (r >= 0) {
+ c->need_gc = r;
+
+ for (i = 0; i < c->sb.nbuckets; i++) {
+ c->buckets[i].last_gc = c->gc_in_progress[i];
+ c->need_gc = max_t(uint8_t, c->need_gc,
+ c->buckets[i].generation - c->buckets[i].last_gc);
+ }
+
+ pr_debug("garbage collect done, new need_gc %i", c->need_gc);
+ }
+
+ t = c->gc_in_progress;
+ c->gc_in_progress = NULL;
+ spin_unlock(&c->bucket_lock);
+
+ kfree(t);
+}
+
+static void btree_insert_one_key(struct cache_device *c, struct btree_key *i[],
+ struct btree_key *k)
+{
+ int j;
+ size_t m, n;
+ BUG_ON(!PageDirty(virt_to_page(*i)));
+
+ n = m = btree_bsearch(i, k->key);
+
+ if (m > 1) {
+ if (TREE_PTR_OFFSET(i, m - 1) == PTR_OFFSET(k->ptr) &&
+ !PTR_LENGTH(k->ptr)) {
+ /* Replacing a stale pointer to a btree bucket */
+ m--;
+ k->ptr--;
+ spin_lock(&c->bucket_lock);
+ c->buckets[sector_to_bucket(PTR_OFFSET(k->ptr))].generation--;
+ spin_unlock(&c->bucket_lock);
+ }
+#if 0
+ else if (k->key - PTR_LENGTH(k->ptr) <=
+ node(i, m - 1)->key - TREE_PTR_LENGTH(i, m - 1))
+ /* Replacing a stale key */
+ m--;
+ else if (k->key - PTR_LENGTH(k->ptr) < node(i, m - 1)->key) {
+ /* Key partially overwrites an existing key */
+ node(i, m - 1)->key -= k->key - node(i, m - 1)->key;
+ node(i, m - 1)->ptr -= PTR_LENGTH(k->key - node(i, m - 1)->key);
+ }
+#endif
+ }
+
+ pr_debug("%s at %lu h->nkeys %i key %llu ptr %llu",
+ m == n ? "inserting" : "replacing",
+ m, keys(i), k->key, k->ptr);
+
+ if (m == n) {
+ for (j = keys(i)++; j >= m; --j)
+ *node(i, j+1) = *node(i, j);
+
+ /* necessary for btree_split */
+ if (!(keys(i) & keys_per_page))
+ SetPageDirty(virt_to_page(i[keys(i) / keys_per_page]));
+ }
+
+ *node(i, m) = *k;
+}
+
+static int btree_split(struct cache_device *c, struct cached_bucket *b,
+ struct btree_key *new_keys, int *n)
+{
+ int ret = 0;
+ struct cached_bucket *n1, *n2 = NULL, *n3 = NULL;
+ struct btree_node_header *h;
+
+ h = header(b);
+ pr_debug("splitting at level %i sector %lu nkeys %i",
+ b->level, b->offset, h->nkeys);
+ btree_clean(c, b);
+
+ if (h->nkeys < keys_per_page * pages_per_btree / 2) {
+ bool lru = c->sb.btree_level != b->level;
+ pr_debug("not splitting: %i keys", h->nkeys);
+
+ while (*n)
+ btree_insert_one_key(c, data(b), &new_keys[--(*n)]);
+
+ if (!(n1 = btree_alloc(c, b->level, data(b), h->nkeys, 0, lru)))
+ goto err;
+
+ btree_write_node(c, n1, 0);
+
+ if (lru)
+ new_keys[(*n)++] = bucket_key(n1);
+ else
+ set_new_root(c, n1);
+ goto out;
+ }
+
+ if (!(n1 = btree_alloc(c, b->level, data(b), h->nkeys >> 1, 0, true)) ||
+ !(n2 = btree_alloc(c, b->level, data(b), h->nkeys, h->nkeys >> 1, true)))
+ goto err;
+
+ while (*n)
+ if (new_keys[--(*n)].key <= last_key(data(n1)))
+ btree_insert_one_key(c, data(n1), &new_keys[*n]);
+ else
+ btree_insert_one_key(c, data(n2), &new_keys[*n]);
+
+ new_keys[(*n)++] = bucket_key(n2);
+ new_keys[(*n)++] = bucket_key(n1);
+
+ btree_write_node(c, n1, 0);
+ btree_write_node(c, n2, 0);
+
+ if (c->sb.btree_level == b->level) {
+ if (!(n3 = btree_alloc(c, b->level + 1, NULL, 0, 0, false)))
+ goto err;
+
+ while (*n)
+ btree_insert_one_key(c, data(n3), &new_keys[--(*n)]);
+ btree_write_node(c, n3, 0);
+
+ rw_unlock(true, &n3->lock);
+ set_new_root(c, n3);
+ }
+
+ rw_unlock(true, &n2->lock);
+out:
+ rw_unlock(true, &n1->lock);
+ btree_free(c, b);
+ return ret;
+err:
+ printk(KERN_WARNING "bcache: couldn't split");
+ if (n2) {
+ btree_free(c, n2);
+ rw_unlock(true, &n2->lock);
+ }
+ if (n1) {
+ btree_free(c, n1);
+ rw_unlock(true, &n1->lock);
+ }
+ btree_write_node(c, b, 0);
+ return 0;
+}
+
+static int btree_insert(struct cache_device *c, struct cached_bucket *b,
+ struct btree_key *new_keys, int *n,
+ struct search_context **s)
+{
+ int ret = 0, nread;
+ uint64_t biggest = 0;
+ struct btree_key **i;
+
+ while (*n) {
+ for_each_sorted_set(i, b) {
+ if (keys(i))
+ biggest = max(biggest, last_key(i));
+
+ if (PageDirty(b->pages[index(i, b)]))
+ break;
+ }
+
+ if (index(i, b) >= pages_per_btree)
+ return btree_split(c, b, new_keys, n);
+
+ if (rand(i) != header(b)->random) {
+ rand(i) = header(b)->random;
+ keys(i) = 0;
+ SetPageDirty(b->pages[index(i, b)]);
+ }
+
+ pr_debug("inserting %i keys %llu at %lu level %i page %li h->nkeys %i",
+ *n, new_keys[*n - 1].key, b->offset, b->level, index(i, b), keys(i));
+
+ while (*n && (keys(i) + 1) % keys_per_page) {
+ btree_insert_one_key(c, i, &new_keys[--(*n)]);
+
+ if (new_keys[*n].key > biggest) {
+ new_keys[0].key = new_keys[*n].key;
+ ret = 1;
+ }
+
+ biggest = max(new_keys[*n].key, biggest);
+ }
+
+ btree_write_node(c, b, index(i, b));
+ }
+
+ if (ret == 1 && b->level != c->sb.btree_level) {
+ inc_gen(c, b->offset);
+ new_keys[(*n)++].ptr = bucket_to_ptr(b);
+ }
+
+ label(again, ret = -1);
+ return ret;
+}
+
+static int btree_insert_recurse(struct cache_device *c, struct cached_bucket *b,
+ int *level, struct btree_key *new_keys, int *n,
+ struct search_context **s)
+{
+ int j, ret = 0, nread;
+ struct btree_key **i;
+ struct cached_bucket *r;
+ bool write = !(b->level - *level);
+
+ if (!atomic_read(&b->nread))
+ goto again;
+
+ if (!header(b)->random) {
+trashed: if (c->sb.btree_level != b->level) {
+ btree_free(c, b);
+ goto done;
+ }
+
+ printk(KERN_WARNING "bcache: btree was trashed, h->nkeys %i", header(b)->nkeys);
+ r = btree_alloc(c, 0, NULL, 0, 0, false);
+ set_new_root(c, r);
+
+ btree_free(c, b);
+ rw_unlock(write, &b->lock);
+
+ b = r;
+ write = true;
+ }
+
+ if (b->level - *level) {
+ struct btree_key recurse_key = { .key = 0, .ptr = 0 };
+
+ for_each_sorted_set(i, b) {
+ for (j = btree_bsearch(i, new_keys[0].key);
+ j <= keys(i) && ptr_bad(c, node(i, j));
+ j++)
+ ;
+
+ if (j > keys(i))
+ for (j = keys(i); j > 0 && ptr_bad(c, node(i, j)); j--)
+ ;
+
+ /* Pick the smallest key to recurse on that's bigger
+ * than the key we're inserting, or failing that,
+ * the biggest key.
+ */
+ if (j &&
+ ((node(i, j)->key > recurse_key.key && recurse_key.key < new_keys[0].key) ||
+ (node(i, j)->key < recurse_key.key && node(i, j)->key > new_keys[0].key)))
+ recurse_key = *node(i, j);
+ }
+
+ /* No key to recurse on */
+ if (!recurse_key.ptr)
+ goto trashed;
+
+ r = get_bucket(c, PTR_OFFSET(recurse_key.ptr),
+ b->level - 1, !(b->level - *level - 1), s);
+ if (!r)
+ goto retry;
+
+ ret = btree_insert_recurse(c, r, level, new_keys, n, s);
+ }
+
+ if (*n && ret >= 0) {
+ *level = b->level;
+ upgrade_or_retry(c, b, write, s);
+ ret = btree_insert(c, b, new_keys, n, s);
+ }
+done:
+ label(retry, ret = -2);
+ label(again, ret = -1);
+ rw_unlock(write, &b->lock);
+ return ret;
+}
+
+static void btree_insert_async(void *q, struct bio *bio,
+ struct search_context *s)
+{
+ struct cache_device *c = q;
+ int ret, keys = 1, level = 0;
+
+ down_read(&c->gc_lock);
+ ret = run_on_root(!(c->root->level - level),
+ btree_insert_recurse, &level, s->new_keys, &keys, &s);
+ up_read(&c->gc_lock);
+
+ return_f(s, ret == -1 ? btree_insert_async : NULL);
+}
+
+static void bio_insert(void *private, struct bio *bio,
+ struct search_context *s)
+{
+ int dev, idx;
+ struct cache_device *c = NULL, *i;
+ struct bio_vec *bv;
+ struct bio *n = NULL;
+
+ atomic_set(&bio->bi_remaining, 1);
+ bio->bi_bdev = c->bdev;
+ bio->bi_private = s->bi_private;
+ bio->bi_end_io = s->bi_end_io;
+ bio->bi_next = NULL;
+ bio->bi_idx = 0;
+ bio->bi_size = 0;
+ bio_for_each_segment(bv, bio, idx)
+ bio->bi_size += bv->bv_len;
+
+ if (!bio->bi_size || s->error || list_empty(&cache_devices))
+ goto err;
+
+ /*struct open_bucket *b;
+ list_for_each_entry(b, &open_buckets, list)
+ if (bio->bi_bdev->bd_cache_identifier == b->identifier &&
+ KEY_OFFSET(b->key.key) + (len >> 9) == s->bi_sector) {
+ c = b->cache;
+ c->last_used = jiffies;
+ list_move(&b->list, &open_buckets);
+ goto found;
+ }*/
+
+ list_for_each_entry(i, &cache_devices, list)
+ if (!c || c->last_used > i->last_used)
+ c = i;
+ c->last_used = jiffies;
+
+ dev = lookup_dev(c, bio);
+ if (dev == UUIDS_PER_SB)
+ goto err;
+
+ while (n != bio) {
+ struct search_context t = { .q = c, .new_keys[0] = { 0, 0 } };
+ sector_t split, offset;
+
+ spin_lock(&c->bucket_lock);
+ if (c->sectors_free < min_t(int, bio_sectors(bio), PAGE_SIZE >> 9)) {
+ if (!pop_bucket(c)) {
+ spin_unlock(&c->bucket_lock);
+ goto err;
+ }
+
+ t.new_keys[0] = c->current_key;
+ c->current_key = (struct btree_key) {0, 0};
+ }
+
+ split = min_t(int, bio_sectors(bio), c->sectors_free);
+
+ offset = c->current_bucket + c->sb.bucket_size - c->sectors_free;
+ c->sectors_free -= split;
+ s->bi_sector += split;
+ spin_unlock(&c->bucket_lock);
+
+ n = bio_split_front(bio, split);
+ if (!n)
+ goto err;
+
+ n->bi_sector = offset;
+
+ if (c->current_key.ptr &&
+ c->current_key.key + bio_sectors(n) == TREE_KEY(dev, s->bi_sector)) {
+ c->current_key.key += TREE_KEY(0, bio_sectors(n));
+ c->current_key.ptr += TREE_PTR(0, bio_sectors(n), 0);
+ } else {
+ if (c->current_key.ptr)
+ t.new_keys[0] = c->current_key;
+
+ if (t.new_keys[0].ptr) {
+ heap_insert(c, sector_to_bucket(c->current_bucket));
+ btree_insert_async(c, NULL, &t);
+ }
+
+ c->current_key.key = TREE_KEY(dev, s->bi_sector);
+ c->current_key.ptr = TREE_PTR(sector_to_gen(c->current_bucket),
+ bio_sectors(n), n->bi_sector);
+ }
+
+ pr_debug("adding to cache %u sectors at %lu key %llu",
+ bio_sectors(n), n->bi_sector, c->current_key.key);
+ submit_bio(WRITE, n);
+ }
+
+ /* refcounting problem when error and bio's been split?
+ */
+err:
+ return_f(s, NULL);
+}
+
+static void request_hook_read(void *p, struct bio *bio,
+ struct search_context *s)
+{
+ int ret = -1, dev;
+ struct request_queue *q = p;
+ struct cache_device *c;
+
+ if (list_empty(&cache_devices))
+ goto out;
+
+ list_for_each_entry(c, &cache_devices, list) {
+ dev = lookup_dev(c, bio);
+ if (dev == UUIDS_PER_SB)
+ goto out;
+
+ ret = max(ret, run_on_root(false, btree_search, dev, bio, &s));
+
+ if (ret == 1) {
+ cache_hit(c, bio);
+ return_f(s, NULL);
+ } else {
+ cache_hit(c, bio->bi_next);
+ bio->bi_next = NULL;
+ }
+ }
+
+ s = alloc_search(s);
+ s->bio = bio;
+ s->q = q;
+
+ if (!ret)
+ return_f(s, request_hook_read);
+
+ pr_debug("cache miss for %lu, starting write", bio->bi_sector);
+ cache_misses++;
+
+ list_for_each_entry(c, &cache_devices, list)
+ rescale_heap(c, bio_sectors(bio));
+
+ s->end_fn = bio_insert;
+ s->bi_end_io = bio->bi_end_io;
+ s->bi_private = bio->bi_private;
+ s->bi_sector = bio->bi_sector;
+
+ bio->bi_private = s;
+ bio->bi_end_io = bio_add_work;
+ bio_get(bio);
+
+out:
+ if (q->make_request_fn(q, bio))
+ generic_make_request(bio);
+}
+
+static void request_hook_write(struct request_queue *q, struct bio *bio,
+ struct search_context *s)
+{
+ if (q->make_request_fn(q, bio))
+ generic_make_request(bio);
+}
+
+static int request_hook(struct request_queue *q, struct bio *bio)
+{
+ struct search_context s;
+ memset(&s, 0, sizeof(s));
+ if (bio->bi_size) {
+ if (bio_rw_flagged(bio, BIO_RW))
+ request_hook_write(q, bio, &s);
+ else
+ request_hook_read(q, bio, &s);
+ return 0;
+ } else
+ return 1;
+}
+
+#define write_attribute(n) \
+ static struct attribute sysfs_##n = { .name = #n, .mode = S_IWUSR }
+#define read_attribute(n) \
+ static struct attribute sysfs_##n = { .name = #n, .mode = S_IRUSR }
+#define rw_attribute(n) \
+ static struct attribute sysfs_##n = \
+ { .name = #n, .mode = S_IWUSR|S_IRUSR }
+
+#define sysfs_print(file, ...) \
+ if (attr == &sysfs_ ## file) \
+ return snprintf(buffer, PAGE_SIZE, __VA_ARGS__)
+
+#define sysfs_atoi(file, var) \
+ if (attr == &sysfs_ ## file) { \
+ unsigned long _v, _r = strict_strtoul(buffer, 10, &_v); \
+ if (_r) \
+ return _r; \
+ var = _v; \
+ }
+
+write_attribute(register_cache);
+write_attribute(register_dev);
+write_attribute(unregister);
+
+read_attribute(bucket_size);
+read_attribute(nbuckets);
+read_attribute(cache_hits);
+read_attribute(cache_hit_ratio);
+read_attribute(cache_misses);
+read_attribute(tree_depth);
+read_attribute(min_priority);
+
+rw_attribute(cache_priority_initial);
+rw_attribute(cache_priority_hit);
+rw_attribute(cache_priority_seek);
+rw_attribute(cache_priority_rescale);
+
+DECLARE_WAIT_QUEUE_HEAD(pending);
+
+static void write_super_endio(struct bio *bio, int error)
+{
+ if (error)
+ pr_debug("io error writing superblock %i", error);
+ bio_put(bio);
+}
+
+static void load_priorites_endio(struct bio *bio, int error)
+{
+ atomic_t *wait = bio->bi_private;
+ if (error)
+ printk(KERN_ERR "bcache: Error reading priorities");
+ atomic_dec(wait);
+ wake_up(&pending);
+ bio_put(bio);
+}
+
+static void load_priorities(struct cache_device *c)
+{
+ atomic_t wait;
+ long i = 0, used = 0, vecs = bio_get_nr_vecs(c->bdev);
+ int n = (sizeof(struct bucket_disk) * c->sb.nbuckets) / PAGE_SIZE + 1;
+ struct bio *bio = bio_kmalloc(GFP_KERNEL, n);
+
+ if (!bio)
+ return;
+
+ bio->bi_sector = PRIO_SECTOR;
+ bio->bi_bdev = c->bdev;
+ bio->bi_vcnt = n;
+ bio->bi_size = PAGE_SIZE * n;
+
+ bio->bi_private = &wait;
+ bio->bi_end_io = load_priorites_endio;
+
+ for (i = 0; i < n; i++) {
+ bio->bi_io_vec[i].bv_page =
+ vmalloc_to_page((void *) c->disk_buckets
+ + PAGE_SIZE * i);
+ bio->bi_io_vec[i].bv_len = PAGE_SIZE;
+ bio->bi_io_vec[i].bv_offset = 0;
+ }
+
+ while (bio->bi_vcnt > vecs)
+ submit_bio(READ, bio_split_front(bio, vecs * PAGE_SIZE << 9));
+
+ atomic_set(&wait, 1);
+ submit_bio(READ, bio);
+ wait_event(pending, atomic_read(&wait) == 0);
+
+ for (i = 0; i < c->sb.nbuckets; i++) {
+ atomic_set(&c->buckets[i].pin, 0);
+ c->buckets[i].heap = -1;
+ c->buckets[i].priority = le16_to_cpu(c->disk_buckets[i].priority);
+ c->buckets[i].generation = c->disk_buckets[i].generation;
+
+ if (c->buckets[i].priority != (uint16_t) ~0 &&
+ c->buckets[i].priority)
+ used++;
+
+ if (c->buckets[i].priority != (uint16_t) ~0)
+ if (c->buckets[i].priority != 0 || !fifo_push(&c->free, i))
+ __heap_insert(c, i);
+ }
+ pr_debug("Cache loaded, %li buckets in use", used);
+}
+
+static void save_priorities(struct cache_device *c)
+{
+ long i, n = (sizeof(struct bucket_disk) * c->sb.nbuckets) / PAGE_SIZE + 1;
+ struct bio *bio = bio_kmalloc(GFP_NOIO, n);
+
+ if (!bio)
+ return;
+
+ bio->bi_sector = PRIO_SECTOR;
+ bio->bi_bdev = c->bdev;
+ bio->bi_vcnt = n;
+ bio->bi_size = PAGE_SIZE * n;
+
+ bio->bi_private = c;
+ bio->bi_end_io = write_super_endio;
+
+ for (i = 0; i < n; i++) {
+ bio->bi_io_vec[i].bv_page =
+ vmalloc_to_page((void *) c->disk_buckets
+ + PAGE_SIZE * i);
+ bio->bi_io_vec[i].bv_len = PAGE_SIZE;
+ bio->bi_io_vec[i].bv_offset = 0;
+ }
+
+ for (i = 0; i < c->sb.nbuckets; i++) {
+ c->disk_buckets[i].priority = cpu_to_le16(c->buckets[i].priority);
+ c->disk_buckets[i].generation = c->buckets[i].generation;
+ }
+
+ submit_bio(WRITE, bio);
+}
+
+static void register_dev_on_cache(struct cache_device *c, int d)
+{
+ int i;
+
+ for (i = 0; i < UUIDS_PER_SB; i++) {
+ if (is_zero(&c->uuids->b_data[i*16], 16)) {
+ pr_debug("inserted new uuid at %i", i);
+ memcpy(c->uuids->b_data + i*16, &uuids[d*16], 16);
+ set_buffer_dirty(c->uuids);
+ sync_dirty_buffer(c->uuids);
+ break;
+ }
+
+ if (!memcmp(&c->uuids->b_data[i*16], &uuids[d*16], 16)) {
+ /*
+ * Need to check if device was already opened
+ * read/write and invalidate previous data if it was.
+ */
+ pr_debug("looked up uuid at %i", i);
+ break;
+ }
+ }
+
+ if (i == UUIDS_PER_SB) {
+ printk(KERN_DEBUG "Aiee! No room for the uuid");
+ return;
+ }
+
+ c->devices[i] = d;
+}
+
+static int parse_uuid(const char *s, char *uuid)
+{
+ int i, j, x;
+ memset(uuid, 0, 16);
+
+ for (i = 0, j = 0;
+ i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32;
+ i++) {
+ x = s[i] | 32;
+
+ switch (x) {
+ case '0'...'9':
+ x -= '0';
+ break;
+ case 'a'...'f':
+ x -= 'a' - 10;
+ break;
+ default:
+ continue;
+ }
+
+ x <<= ((j & 1) << 2);
+ uuid[j++ >> 1] |= x;
+ }
+ return i;
+}
+
+/*static ssize_t store_dev(struct kobject *kobj, struct attribute *attr,
+ const char *buffer, size_t size)
+{
+ if (attr == &sysfs_unregister) {
+ }
+ return size;
+}
+
+static void unregister_dev(struct kobject *k)
+{
+
+}*/
+
+static void register_dev(const char *buffer, size_t size)
+{
+ int i;
+ const char *err = NULL;
+ char *path = NULL;
+ unsigned char uuid[16];
+ struct block_device *bdev = NULL;
+ struct cached_dev *d;
+ struct cache_device *c;
+
+ /*static struct attribute *files[] = {
+ &sysfs_unregister,
+ NULL
+ };
+ static const struct sysfs_ops ops = {
+ .show = NULL,
+ .store = store_dev
+ };
+ static struct kobj_type dev_obj = {
+ .release = unregister_dev,
+ .sysfs_ops = &ops,
+ .default_attrs = files
+ };*/
+
+ if (!try_module_get(THIS_MODULE))
+ return;
+
+ err = "Bad uuid";
+ i = parse_uuid(buffer, &uuid[0]);
+ if (i < 4)
+ goto err;
+
+ err = "Insufficient memory";
+ if (!(path = kmalloc(size + 1 - i, GFP_KERNEL)) ||
+ !(d = kzalloc(sizeof(*d), GFP_KERNEL)))
+ goto err;
+
+ strcpy(path, skip_spaces(buffer + i));
+ bdev = lookup_bdev(strim(path));
+
+ err = "Failed to open device";
+ if (IS_ERR(bdev))
+ goto err;
+
+ err = "Aready registered";
+ for (i = 0;
+ i < UUIDS_PER_SB && !is_zero(&uuids[i*16], 16);
+ i++)
+ if (!memcmp(&uuids[i*16], uuid, 16))
+ goto err;
+
+ err = "Max devices already open";
+ if (i == UUIDS_PER_SB)
+ goto err;
+
+#if 0
+ blkdev_get(bdev, FMODE_READ|FMODE_WRITE))
+ bdevname(bdev, b);
+ err = "Error creating kobject";
+ if (!kobject_get(bcache_kobj) ||
+ kobject_init_and_add(&d->kobj, &dev_obj,
+ bcache_kobj,
+ "%s", b))
+ goto err;
+#endif
+
+ memcpy(&uuids[i*16], uuid, 16);
+ bdev->bd_cache_identifier = i;
+ /*devices[i] = bdev->bd_disk;*/
+
+ list_for_each_entry(c, &cache_devices, list)
+ register_dev_on_cache(c, i);
+
+ bdev->bd_cache_fn = request_hook;
+ printk(KERN_DEBUG "bcache: Caching %s index %i", path, i);
+
+ if (0) {
+err: printk(KERN_DEBUG "bcache: error opening %s: %s", path, err);
+ if (bdev)
+ bdput(bdev);
+ kzfree(d);
+ }
+ kfree(path);
+}
+
+static void unregister_cache_kobj(struct work_struct *w)
+{
+ struct cache_device *c = container_of(w, struct cache_device, work);
+ list_del(&c->list);
+ INIT_LIST_HEAD(&c->list);
+ kobject_put(&c->kobj);
+}
+
+static ssize_t store_cache(struct kobject *kobj, struct attribute *attr,
+ const char *buffer, size_t size)
+{
+ struct cache_device *c = container_of(kobj, struct cache_device, kobj);
+ if (attr == &sysfs_unregister) {
+ INIT_WORK(&c->work, unregister_cache_kobj);
+ schedule_work(&c->work);
+ }
+ return size;
+}
+
+static ssize_t show_cache(struct kobject *kobj, struct attribute *attr,
+ char *buffer)
+{
+ struct cache_device *c = container_of(kobj, struct cache_device, kobj);
+
+ sysfs_print(bucket_size, "%i\n", c->sb.bucket_size << 9);
+ sysfs_print(nbuckets, "%lli\n", c->sb.nbuckets);
+ sysfs_print(cache_hits, "%lu\n", c->cache_hits);
+ sysfs_print(tree_depth, "%u\n", c->sb.btree_level);
+ sysfs_print(min_priority, "%u\n", c->heap[0] ? c->heap[0]->priority : 0);
+ return 0;
+}
+
+static const char *read_super(struct cache_device *c)
+{
+ const char *err;
+ struct cache_sb *s;
+ struct buffer_head *bh;
+
+ if (!(bh = __bread(c->bdev, 1, 4096)))
+ return "IO error";
+
+ err = "Not a bcache superblock";
+ s = (struct cache_sb *) bh->b_data;
+ if (memcmp(s->magic, bcache_magic, 16))
+ goto err;
+
+ c->sb.version = le32_to_cpu(s->version);
+ c->sb.block_size = le16_to_cpu(s->block_size);
+ c->sb.bucket_size = le16_to_cpu(s->bucket_size);
+ c->sb.journal_start = le32_to_cpu(s->journal_start);
+ c->sb.first_bucket = le32_to_cpu(s->first_bucket);
+ c->sb.nbuckets = le64_to_cpu(s->nbuckets);
+ c->sb.btree_root = le64_to_cpu(s->btree_root);
+ c->sb.btree_level = le16_to_cpu(s->btree_level);
+
+ err = "Unsupported superblock version";
+ if (c->sb.version > CACHE_CLEAN)
+ goto err;
+
+ err = "Bad block/bucket size";
+ if (!c->sb.block_size ||
+ c->sb.bucket_size & (PAGE_SIZE / 512 - 1) ||
+ c->sb.bucket_size < c->sb.block_size)
+ goto err;
+
+ err = "Too many buckets";
+ if (c->sb.nbuckets > LONG_MAX)
+ goto err;
+
+ err = "Invalid superblock";
+ if (c->sb.journal_start * c->sb.bucket_size <
+ 24 + (c->sb.nbuckets * sizeof(struct bucket_disk)) / 512)
+ goto err;
+
+ if (c->sb.first_bucket < c->sb.journal_start ||
+ get_capacity(c->bdev->bd_disk) < bucket_to_sector(c->sb.nbuckets))
+ goto err;
+
+ if (c->sb.btree_root < c->sb.first_bucket * c->sb.bucket_size ||
+ c->sb.btree_root >= bucket_to_sector(c->sb.nbuckets))
+ goto err;
+
+ err = "Bucket size must be multiple of page size";
+ if (!pages_per_btree ||
+ c->sb.bucket_size & ((PAGE_SIZE >> 9) - 1))
+ goto err;
+
+ err = NULL;
+
+ get_page(bh->b_page);
+ c->sb_page = bh->b_page;
+err:
+ put_bh(bh);
+ return err;
+}
+
+static void write_super(struct cache_device *c)
+{
+ struct cache_sb *s;
+ struct bio *bio = bio_kmalloc(GFP_NOIO, 1);
+
+ if (!bio)
+ return;
+
+ BUG_ON(list_empty(&c->list) != (c->sb.version & CACHE_CLEAN));
+
+ pr_debug("ver %i, root %llu, level %i",
+ c->sb.version, c->sb.btree_root, c->sb.btree_level);
+
+ bio->bi_sector = SB_SECTOR;
+ bio->bi_bdev = c->bdev;
+ bio->bi_vcnt = 1;
+ bio->bi_size = 4096;
+
+ bio->bi_private = c;
+ bio->bi_end_io = write_super_endio;
+
+ bio->bi_io_vec[0].bv_page = c->sb_page;
+ bio->bi_io_vec[0].bv_len = 4096;
+ bio->bi_io_vec[0].bv_offset = 0;
+
+ s = kmap(c->sb_page);
+
+ memcpy(s->magic, bcache_magic, 16);
+ s->version = cpu_to_le32(c->sb.version);
+ s->block_size = cpu_to_le16(c->sb.block_size);
+ s->bucket_size = cpu_to_le16(c->sb.bucket_size);
+ s->journal_start = cpu_to_le32(c->sb.journal_start);
+ s->first_bucket = cpu_to_le32(c->sb.first_bucket);
+ s->nbuckets = cpu_to_le64(c->sb.nbuckets);
+ s->btree_root = cpu_to_le64(c->sb.btree_root);
+ s->btree_level = cpu_to_le16(c->sb.btree_level);
+
+ kunmap(c->sb_page);
+ submit_bio(WRITE, bio);
+}
+
+static void free_cache(struct cache_device *c)
+{
+ struct cached_bucket *b;
+
+ while (!list_empty(&c->lru)) {
+ b = list_first_entry(&c->lru, struct cached_bucket, lru);
+ list_del(&b->lru);
+ free_bucket_contents(c, b);
+ kzfree(b);
+ }
+
+ kfree(c->gc_in_progress);
+
+ if (c->kobj.state_initialized) {
+ kobject_put(bcache_kobj);
+ kobject_put(&c->kobj);
+ }
+
+ if (c->root) {
+ free_bucket_contents(c, c->root);
+ kzfree(c->root);
+ }
+
+ free_fifo(&c->free);
+
+ vfree(c->disk_buckets);
+ vfree(c->buckets);
+ vfree(c->heap);
+ if (c->uuids)
+ put_bh(c->uuids);
+ if (c->sb_page)
+ put_page(c->sb_page);
+ if (!IS_ERR_OR_NULL(c->bdev))
+ close_bdev_exclusive(c->bdev, FMODE_READ|FMODE_WRITE);
+
+ module_put(c->owner);
+ kzfree(c);
+}
+
+static void register_cache(const char *buffer, size_t size)
+{
+ int i, n;
+ const char *err = NULL;
+ char *path = NULL, b[BDEVNAME_SIZE];
+ struct cache_device *c = NULL;
+ struct search_context s, *sp = &s;
+
+ static struct attribute *files[] = {
+ &sysfs_unregister,
+ &sysfs_bucket_size,
+ &sysfs_nbuckets,
+ &sysfs_cache_hits,
+ &sysfs_tree_depth,
+ &sysfs_min_priority,
+ NULL
+ };
+ static const struct sysfs_ops ops = {
+ .show = show_cache,
+ .store = store_cache
+ };
+ static struct kobj_type cache_obj = {
+ .release = unregister_cache,
+ .sysfs_ops = &ops,
+ .default_attrs = files
+ };
+
+ if (!try_module_get(THIS_MODULE))
+ return;
+
+ err = "Insufficient memory";
+ if (!(path = kmalloc(size + 1, GFP_KERNEL)) ||
+ !(c = kzalloc(sizeof(*c), GFP_KERNEL)))
+ goto err;
+
+ c->owner = THIS_MODULE;
+ INIT_LIST_HEAD(&c->lru);
+
+ strcpy(path, skip_spaces(buffer));
+
+ err = "Failed to open cache device";
+ c->bdev = open_bdev_exclusive(strim(path), FMODE_READ|FMODE_WRITE, c);
+ if (IS_ERR(c->bdev))
+ goto err;
+
+ set_blocksize(c->bdev, 4096);
+
+ if ((err = read_super(c)))
+ goto err;
+
+ err = "IO error reading UUIDs";
+ if (!(c->uuids = __bread(c->bdev, 2, PAGE_SIZE)))
+ goto err;
+
+ err = "Not enough buckets";
+ if (c->sb.nbuckets >> 7 <= 1)
+ goto err;
+
+ n = (c->sb.nbuckets - 1) / (PAGE_SIZE / sizeof(struct bucket_disk)) + 1;
+
+ err = "Insufficient memory";
+ if (!(c->heap = vmalloc(c->sb.nbuckets * sizeof(struct bucket *))) ||
+ !(c->buckets = vmalloc(c->sb.nbuckets * sizeof(struct bucket))) ||
+ !(c->disk_buckets = vmalloc(c->sb.nbuckets * sizeof(struct bucket_disk))) ||
+ !init_fifo(&c->free, c->sb.nbuckets >> 7, GFP_KERNEL))
+ goto err;
+
+ memset(c->heap, 0, c->sb.nbuckets * sizeof(struct bucket *));
+ memset(c->buckets, 0, c->sb.nbuckets * sizeof(struct bucket));
+
+ spin_lock_init(&c->sb_lock);
+ spin_lock_init(&c->bucket_lock);
+ init_rwsem(&c->gc_lock);
+
+ c->btree_buckets_cached = 10;
+
+ load_priorities(c);
+
+ memset(&s, 0, sizeof(s));
+ if (c->sb.version & CACHE_CLEAN)
+ c->root = get_bucket(c, c->sb.btree_root, c->sb.btree_level, true, &sp);
+ else
+ printk(KERN_DEBUG "bcache: Cache device %s was dirty, invalidating existing data", path);
+
+ c->sb.version &= ~CACHE_CLEAN;
+ if (!c->root) {
+ c->root = btree_alloc(c, 0, NULL, 0, 0, false);
+ if (!c->root)
+ goto err;
+ set_new_root(c, c->root);
+ } else
+ list_del(&c->root->lru);
+
+ rw_unlock(true, &c->root->lock);
+ BUG_ON(sector_to_priority(c->root->offset) != (uint16_t) ~0);
+
+ if (!(c->gc_in_progress = kmalloc(c->sb.nbuckets, GFP_KERNEL)))
+ goto err;
+
+ for (i = 0; i < c->sb.nbuckets; i++)
+ c->gc_in_progress[i] = c->buckets[i].generation;
+ do_btree_gc(&c->work);
+
+ for (i = 0; i < UUIDS_PER_SB; i++)
+ c->devices[i] = ~0;
+
+ for (i = 0; i < UUIDS_PER_SB && !is_zero(&uuids[i*16], 16); i++)
+ register_dev_on_cache(c, i);
+
+ err = "Error creating kobject";
+ bdevname(c->bdev, b);
+ if (!kobject_get(bcache_kobj) ||
+ kobject_init_and_add(&c->kobj, &cache_obj,
+ bcache_kobj,
+ "%s", b))
+ goto err;
+
+ list_add(&c->list, &cache_devices);
+
+ printk(KERN_DEBUG "bcache: Loaded cache device %s", path);
+ pr_debug("btree root at %li", c->root->offset);
+
+ if (0) {
+err: printk(KERN_DEBUG "bcache: error opening %s: %s", path, err);
+ if (c) {
+ if (c->bdev == ERR_PTR(EBUSY))
+ err = "Device busy";
+ free_cache(c);
+ }
+ }
+ kfree(path);
+}
+
+static void unregister_cache(struct kobject *k)
+{
+ struct cache_device *c = container_of(k, struct cache_device, kobj);
+ struct cached_bucket *b;
+
+ /*
+ * need to write out current key
+ */
+
+ list_for_each_entry(b, &c->lru, lru)
+ __btree_write_node(c, b, 0, pages_per_btree);
+
+ c->sb.version |= CACHE_CLEAN;
+ save_priorities(c);
+ write_super(c);
+ free_cache(c);
+}
+
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buffer)
+{
+ sysfs_print(cache_hits, "%lu\n", cache_hits);
+ sysfs_print(cache_hit_ratio, "%lu%%\n",
+ cache_hits * 100 / (cache_hits + cache_misses));
+ sysfs_print(cache_misses, "%lu\n", cache_misses);
+ sysfs_print(cache_priority_initial, "%i\n", initial_priority);
+ sysfs_print(cache_priority_hit, "%i\n", cache_hit_priority);
+ sysfs_print(cache_priority_seek, "%i\n", cache_hit_seek);
+ sysfs_print(cache_priority_rescale, "%li\n", rescale);
+ return 0;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+ const char *buffer, size_t size)
+{
+ if (attr == &sysfs_register_cache)
+ register_cache(buffer, size);
+ if (attr == &sysfs_register_dev)
+ register_dev(buffer, size);
+ sysfs_atoi(cache_priority_initial, initial_priority);
+ sysfs_atoi(cache_priority_hit, cache_hit_priority);
+ sysfs_atoi(cache_priority_seek, cache_hit_seek);
+ sysfs_atoi(cache_priority_rescale, rescale);
+
+ return size;
+}
+
+static int __init bcache_init(void)
+{
+ static const struct sysfs_ops ops = { .show = show, .store = store };
+ static const struct attribute *files[] = { &sysfs_register_cache,
+ &sysfs_register_dev,
+ &sysfs_cache_hits,
+ &sysfs_cache_hit_ratio,
+ &sysfs_cache_misses,
+ &sysfs_cache_priority_initial,
+ &sysfs_cache_priority_hit,
+ &sysfs_cache_priority_seek,
+ &sysfs_cache_priority_rescale,
+ NULL};
+
+ printk(KERN_DEBUG "bcache loading");
+
+ delayed = create_workqueue("bcache");
+ if (!delayed)
+ return -ENOMEM;
+
+ bcache_kobj = kobject_create_and_add("bcache", kernel_kobj);
+ if (!bcache_kobj)
+ return -ENOMEM;
+
+ bcache_kobj->ktype->sysfs_ops = &ops;
+ return sysfs_create_files(bcache_kobj, files);
+}
+
+static void bcache_exit(void)
+{
+ struct cache_device *c;
+
+ sysfs_remove_file(bcache_kobj, &sysfs_register_cache);
+ sysfs_remove_file(bcache_kobj, &sysfs_register_dev);
+
+ /*for (i = 0; i < UUIDS_PER_SB; i++)
+ if (devices[i] && devices[i])
+ devices[i]->bd_cache_fn = NULL;*/
+
+ list_for_each_entry(c, &cache_devices, list)
+ kobject_put(&c->kobj);
+}
+
+module_init(bcache_init);
+module_exit(bcache_exit);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/