[RFC] B+Tree library

From: JÃrn Engel
Date: Sun Oct 26 2008 - 08:47:16 EST


The idea of using btrees in the kernel is not exactly new. They have a
number of advantages over rbtrees and hashtables, but also a number of
drawbacks. More importantly, logfs already makes use of them and -
since I don't want any incompatible code to get merged and suffer the
trouble it creates - I would like to discuss my implementation and where
it makes sense and where it doesn't.

General advantages of btrees are memory density and efficient use of
cachelines. Hashtables are either too small and degrade into linked
list performance, or they are too large and waste memory. With changing
workloads, both may be true on the same system. Rbtrees have a bad
fanout of less than 2 (they are not actually balanced binary trees),
hence reading a fairly large number of cachelines to each lookup.

Main disadvantage of btrees is that they are complicated, come in a
gazillion subtly different variant that differ mainly in the balance
between read efficiency and write efficiency. Comparing btrees against
anything is a bit like comparing apples and random fruits.

This implementation is extremely simple. It splits nodes when they
overflow. It does not move elements to neighboring nodes. It does not
try fancy 2:3 splits. It does not even merge nodes when they shrink,
making degenerate cases possible. And it requires callers to do
tree-global locking. In effect, it will be hard to find anything less
sophisticated.

The one aspect where my implementation is actually nice is in allowing
variable key length. Btree keys are interpreted as an array of unsigned
long. So by passing the correct geometry to the core functions, it is
possible to handle 32bit, 64bit or 128bit btrees, which logfs uses. If
so desired, any other weird data format can be used as well (Zach, are
you reading this?).

So would something like this be merged once some users are identified?
Would it be useful for anything but logfs? Or am I plain nuts?

include/linux/btree.h | 272 ++++++++++++++++++++++
lib/Kconfig | 3
lib/Makefile | 1
lib/btree.c | 603 ++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 879 insertions(+)

JÃrn

--
It is the mark of an educated mind to be able to entertain a thought
without accepting it.
-- Aristotle

diff --git a/include/linux/btree.h b/include/linux/btree.h
new file mode 100644
index 0000000..d56a691
--- /dev/null
+++ b/include/linux/btree.h
@@ -0,0 +1,272 @@
+#ifndef BTREE_H
+#define BTREE_H
+
+#include <linux/kernel.h>
+#include <linux/mempool.h>
+
+/*
+ * B+Tree node format:
+ * [key0, key1, ..., keyN] [val0, val1, ..., valN]
+ * Each key is an array of unsigned longs, head->no_longs in total.
+ * Total number of keys and vals (N) is head->no_pairs.
+ */
+
+struct btree_head {
+ unsigned long *node;
+ mempool_t *mempool;
+ int height;
+ gfp_t gfp_mask;
+};
+
+struct btree_geo {
+ int keylen;
+ int no_pairs;
+};
+extern struct btree_geo btree_geo32;
+extern struct btree_geo btree_geo64;
+extern struct btree_geo btree_geo128;
+
+struct btree_headl { struct btree_head h; };
+struct btree_head32 { struct btree_head h; };
+struct btree_head64 { struct btree_head h; };
+struct btree_head128 { struct btree_head h; };
+
+/*
+ * These couple of functions are all there is to it. The rest of this header
+ * consists only of wrappers that try to add some typesafety, make the code
+ * a little self-documenting and generally be nice to people.
+ */
+void *btree_alloc(gfp_t gfp_mask, void *pool_data);
+void btree_free(void *element, void *pool_data);
+void btree_init(struct btree_head *head, mempool_t *mempool, gfp_t gfp);
+void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key);
+int btree_insert(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, void *val);
+void *btree_remove(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key);
+int btree_merge(struct btree_head *target, struct btree_head *victim,
+ struct btree_geo *geo, unsigned long *duplicate);
+unsigned long *btree_last(struct btree_head *head, struct btree_geo *geo);
+size_t btree_visitor(struct btree_head *head, struct btree_geo *geo,
+ long opaque,
+ void (*func)(void *elem, long opaque, unsigned long *key,
+ size_t index, void *func2), void *func2);
+size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo,
+ long opaque,
+ void (*func)(void *elem, long opaque, unsigned long *key,
+ size_t index, void *func2), void *func2);
+
+/* key is unsigned long */
+static inline void btree_initl(struct btree_headl *head, mempool_t *mempool,
+ gfp_t gfp)
+{
+ btree_init(&head->h, mempool, gfp);
+}
+
+static inline void *btree_lookupl(struct btree_headl *head, unsigned long key)
+{
+ return btree_lookup(&head->h, &btree_geo32, &key);
+}
+
+static inline int btree_insertl(struct btree_headl *head, unsigned long key,
+ void *val)
+{
+ return btree_insert(&head->h, &btree_geo32, &key, val);
+}
+
+static inline void *btree_removel(struct btree_headl *head, unsigned long key)
+{
+ return btree_remove(&head->h, &btree_geo32, &key);
+}
+
+static inline int btree_mergel(struct btree_headl *target,
+ struct btree_headl *victim)
+{
+ unsigned long scratch;
+
+ return btree_merge(&target->h, &victim->h, &btree_geo32, &scratch);
+}
+
+void visitorl(void *elem, long opaque, unsigned long *key, size_t index,
+ void *__func);
+
+typedef void (*visitorl_t)(void *elem, long opaque, unsigned long key,
+ size_t index);
+
+static inline size_t btree_visitorl(struct btree_headl *head, long opaque,
+ visitorl_t func2)
+{
+ return btree_visitor(&head->h, &btree_geo32, opaque, visitorl, func2);
+}
+
+static inline size_t btree_grim_visitorl(struct btree_headl *head, long opaque,
+ visitorl_t func2)
+{
+ return btree_grim_visitor(&head->h, &btree_geo32, opaque, visitorl, func2);
+}
+
+/* key is u32 */
+static inline void btree_init32(struct btree_head32 *head, mempool_t *mempool,
+ gfp_t gfp)
+{
+ btree_init(&head->h, mempool, gfp);
+}
+
+static inline void *btree_lookup32(struct btree_head32 *head, u32 key)
+{
+ return btree_lookup(&head->h, &btree_geo32, (unsigned long *)&key);
+}
+
+static inline int btree_insert32(struct btree_head32 *head, u32 key,
+ void *val)
+{
+ return btree_insert(&head->h, &btree_geo32, (unsigned long *)&key, val);
+}
+
+static inline void *btree_remove32(struct btree_head32 *head, u32 key)
+{
+ return btree_remove(&head->h, &btree_geo32, (unsigned long *)&key);
+}
+
+static inline int btree_merge32(struct btree_head32 *target,
+ struct btree_head32 *victim)
+{
+ unsigned long scratch;
+
+ return btree_merge(&target->h, &victim->h, &btree_geo32, &scratch);
+}
+
+void visitor32(void *elem, long opaque, unsigned long *__key, size_t index,
+ void *__func);
+
+typedef void (*visitor32_t)(void *elem, long opaque, u32 key, size_t index);
+
+static inline size_t btree_visitor32(struct btree_head32 *head, long opaque,
+ visitor32_t func2)
+{
+ return btree_visitor(&head->h, &btree_geo32, opaque, visitor32, func2);
+}
+
+static inline size_t btree_grim_visitor32(struct btree_head32 *head, long opaque,
+ visitor32_t func2)
+{
+ return btree_grim_visitor(&head->h, &btree_geo32, opaque, visitor32, func2);
+}
+
+/* key is u64 */
+static inline void btree_init64(struct btree_head64 *head, mempool_t *mempool,
+ gfp_t gfp)
+{
+ btree_init(&head->h, mempool, gfp);
+}
+
+static inline void *btree_lookup64(struct btree_head64 *head, u64 key)
+{
+ return btree_lookup(&head->h, &btree_geo64, (unsigned long *)&key);
+}
+
+static inline int btree_insert64(struct btree_head64 *head, u64 key,
+ void *val)
+{
+ return btree_insert(&head->h, &btree_geo64, (unsigned long *)&key, val);
+}
+
+static inline void *btree_remove64(struct btree_head64 *head, u64 key)
+{
+ return btree_remove(&head->h, &btree_geo64, (unsigned long *)&key);
+}
+
+static inline int btree_merge64(struct btree_head64 *target,
+ struct btree_head64 *victim)
+{
+ u64 scratch;
+
+ return btree_merge(&target->h, &victim->h, &btree_geo64,
+ (unsigned long *)&scratch);
+}
+
+void visitor64(void *elem, long opaque, unsigned long *__key, size_t index,
+ void *__func);
+
+typedef void (*visitor64_t)(void *elem, long opaque, u64 key, size_t index);
+
+static inline size_t btree_visitor64(struct btree_head64 *head, long opaque,
+ visitor64_t func2)
+{
+ return btree_visitor(&head->h, &btree_geo64, opaque, visitor64, func2);
+}
+
+static inline size_t btree_grim_visitor64(struct btree_head64 *head, long opaque,
+ visitor64_t func2)
+{
+ return btree_grim_visitor(&head->h, &btree_geo64, opaque, visitor64, func2);
+}
+
+/* key is 128bit (two u64) */
+static inline void btree_init128(struct btree_head128 *head, mempool_t *mempool,
+ gfp_t gfp)
+{
+ btree_init(&head->h, mempool, gfp);
+}
+
+static inline void *btree_lookup128(struct btree_head128 *head, u64 k1, u64 k2)
+{
+ u64 key[2] = {k1, k2};
+ return btree_lookup(&head->h, &btree_geo128, (unsigned long *)&key);
+}
+
+static inline int btree_insert128(struct btree_head128 *head, u64 k1, u64 k2,
+ void *val)
+{
+ u64 key[2] = {k1, k2};
+ return btree_insert(&head->h, &btree_geo128, (unsigned long *)&key, val);
+}
+
+static inline void *btree_remove128(struct btree_head128 *head, u64 k1, u64 k2)
+{
+ u64 key[2] = {k1, k2};
+ return btree_remove(&head->h, &btree_geo128, (unsigned long *)&key);
+}
+
+static inline void btree_last128(struct btree_head128 *head, u64 *k1, u64 *k2)
+{
+ u64 *key = (u64 *)btree_last(&head->h, &btree_geo128);
+
+ if (key) {
+ *k1 = key[0];
+ *k2 = key[1];
+ } else {
+ *k1 = 0;
+ *k2 = 0;
+ }
+}
+
+static inline int btree_merge128(struct btree_head128 *target,
+ struct btree_head128 *victim)
+{
+ u64 scratch[2];
+
+ return btree_merge(&target->h, &victim->h, &btree_geo128,
+ (unsigned long *)scratch);
+}
+
+void visitor128(void *elem, long opaque, unsigned long *__key, size_t index,
+ void *__func);
+
+typedef void (*visitor128_t)(void *elem, long opaque, u64 key1, u64 key2,
+ size_t index);
+
+static inline size_t btree_visitor128(struct btree_head128 *head, long opaque,
+ visitor128_t func2)
+{
+ return btree_visitor(&head->h, &btree_geo128, opaque, visitor128, func2);
+}
+
+static inline size_t btree_grim_visitor128(struct btree_head128 *head, long opaque,
+ visitor128_t func2)
+{
+ return btree_grim_visitor(&head->h, &btree_geo128, opaque, visitor128, func2);
+}
+
+#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index cd6d02c..f07a8bd 100644
diff --git a/lib/Kconfig b/lib/Kconfig
index 8cc8e87..7614216 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -129,6 +129,9 @@ config TEXTSEARCH_FSM
config PLIST
boolean

+config BTREE
+ boolean
+
config HAS_IOMEM
boolean
depends on !NO_IOMEM
diff --git a/lib/Makefile b/lib/Makefile
index 2d7001b..b1eed60 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,6 +34,7 @@ lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
obj-$(CONFIG_PLIST) += plist.o
+obj-$(CONFIG_BTREE) += btree.o
obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
obj-$(CONFIG_DEBUG_LIST) += list_debug.o

diff --git a/lib/btree.c b/lib/btree.c
new file mode 100644
index 0000000..e0770da
--- /dev/null
+++ b/lib/btree.c
@@ -0,0 +1,603 @@
+/*
+ * lib/btree.c - Simple In-memory B+Tree
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2007-2008 Joern Engel <joern@xxxxxxxxx>
+ * Bits and pieces stolen from Peter Zijlstra's code, which is
+ * Copyright 2007, Red Hat Inc. Peter Zijlstra <pzijlstr@xxxxxxxxxx>
+ * GPLv2
+ *
+ * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
+ *
+ * A relatively simple B+Tree implementation. I have written it as a learning
+ * excercise to understand how B+Trees work. Turned out to be useful as well.
+ *
+ * B+Trees can be used similar to Linux radix trees (which don't have anything
+ * in common with textbook radix trees, beware). Prerequisite for them working
+ * well is that access to a random tree node is much faster than a large number
+ * of operations within each node.
+ *
+ * Disks have fulfilled the prerequite for a long time. More recently DRAM
+ * has gained similar properties, as memory access times, when measured in cpu
+ * cycles, have increased. Cacheline sizes have increased as well, which also
+ * helps B+Trees.
+ *
+ * Compared to radix trees, B+Trees are more efficient when dealing with a
+ * sparsely populated address space. Between 25% and 50% of the memory is
+ * occupied with valid pointers. When densely populated, radix trees contain
+ * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
+ * pointers.
+ *
+ * This particular implementation stores pointers identified by a long value.
+ * Storing NULL pointers is illegal, lookup will return NULL when no entry
+ * was found.
+ *
+ * Two tricks were used that are not commonly found in textbooks. First, the
+ * lowest values are to the right, not to the left. All used slots within a
+ * node are on the left, all unused slots contain NUL values. Most operations
+ * simply loop once over all slots and terminate on the first NUL.
+ *
+ * Second trick is to special-case the key "0" or NUL. As seen above, this
+ * value indicates an unused slot, so such a value should not be stored in the
+ * tree itself. Instead it is stored in the null_ptr field in the btree_head.
+ */
+
+#include <linux/btree.h>
+#include <linux/cache.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define NODESIZE MAX(L1_CACHE_BYTES, 128)
+
+struct btree_geo btree_geo32 = {
+ .keylen = 1,
+ .no_pairs = NODESIZE / sizeof(long) / 2,
+};
+
+#define LONG_PER_U64 (64 / BITS_PER_LONG)
+struct btree_geo btree_geo64 = {
+ .keylen = LONG_PER_U64,
+ .no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64),
+};
+
+struct btree_geo btree_geo128 = {
+ .keylen = 2 * LONG_PER_U64,
+ .no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64),
+};
+
+static struct kmem_cache *btree_cachep;
+
+void *btree_alloc(gfp_t gfp_mask, void *pool_data)
+{
+ return kmem_cache_alloc(btree_cachep, gfp_mask);
+}
+
+void btree_free(void *element, void *pool_data)
+{
+ kmem_cache_free(btree_cachep, element);
+}
+
+static unsigned long *btree_node_alloc(struct btree_head *head)
+{
+ unsigned long *node;
+
+ node = mempool_alloc(head->mempool, head->gfp_mask);
+ memset(node, 0, NODESIZE);
+ return node;
+}
+
+static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n; i++) {
+ if (l1[i] < l2[i])
+ return -1;
+ if (l1[i] > l2[i])
+ return 1;
+ }
+ return 0;
+}
+
+static unsigned long *longcpy(unsigned long *dest, const unsigned long *src,
+ size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ dest[i] = src[i];
+ return dest;
+}
+
+static unsigned long *longset(unsigned long *s, unsigned long c, size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ s[i] = c;
+ return s;
+}
+
+/*
+ * B+Tree node format:
+ * [key0, key1, ..., keyN] [val0, val1, ..., valN]
+ * Each key is an array of unsigned longs, head->keylen in total.
+ * Total number of keys and vals (N) is head->no_pairs.
+ */
+
+static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n)
+{
+ return &node[n * geo->keylen];
+}
+
+static unsigned long bval(struct btree_geo *geo, unsigned long *node, int n)
+{
+ return node[geo->no_pairs * geo->keylen + n];
+}
+
+static void setkey(struct btree_geo *geo, unsigned long *node,
+ unsigned long *key, int n)
+{
+ longcpy(bkey(geo, node, n), key, geo->keylen);
+}
+
+static void setval(struct btree_geo *geo, unsigned long *node,
+ unsigned long val, int n)
+{
+ node[geo->no_pairs * geo->keylen + n] = val;
+}
+
+static void clearpair(struct btree_geo *geo, unsigned long *node, int n)
+{
+ longset(bkey(geo, node, n), 0, geo->keylen);
+ node[geo->no_pairs * geo->keylen + n] = 0;
+}
+
+#if 0
+static void dumpkey(struct btree_geo *geo, unsigned long *key)
+{
+ int k;
+
+ printk("(%lx", key[0]);
+ for (k = 1; k < geo->keylen; k++)
+ printk(",%lx", key[k]);
+ printk(")");
+}
+
+static void dumpnode(struct btree_geo *geo, unsigned long *node)
+{
+ int i;
+ unsigned long *key;
+
+ printk("%p: ", node);
+ for (i = 0; i < geo->no_pairs; i++) {
+ key = bkey(geo, node, i);
+ dumpkey(geo, key);
+ printk(" %lx ", bval(geo, node, i));
+ }
+ printk("\n");
+}
+
+static void __dumptree(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *node, int height)
+{
+ int i;
+ unsigned long *child;
+
+ if (!height)
+ return;
+
+ printk("%2x ", height);
+ dumpnode(geo, node);
+ for (i = 0; i < geo->no_pairs; i++) {
+ child = (void *)bval(geo, node, i);
+ if (!child)
+ return;
+ __dumptree(head, geo, child, height - 1);
+ }
+}
+
+static void dumptree(struct btree_head *head, struct btree_geo *geo)
+{
+ __dumptree(head, geo, head->node, head->height);
+}
+#endif
+
+static inline void __btree_init(struct btree_head *head)
+{
+ head->node = NULL;
+ head->height = 0;
+}
+
+void btree_init(struct btree_head *head, mempool_t *mempool, gfp_t gfp)
+{
+ __btree_init(head);
+ head->mempool = mempool;
+ head->gfp_mask = gfp;
+}
+
+unsigned long *btree_last(struct btree_head *head, struct btree_geo *geo)
+{
+ int height = head->height;
+ unsigned long *node = head->node;
+
+ if (height == 0)
+ return NULL;
+
+ for ( ; height > 1; height--)
+ node = (unsigned long *)bval(geo, node, 0);
+
+ return bkey(geo, node, 0);
+}
+
+static int keycmp(struct btree_geo *geo, unsigned long *node, int pos,
+ unsigned long *key)
+{
+ return longcmp(bkey(geo, node, pos), key, geo->keylen);
+}
+
+void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key)
+{
+ int i, height = head->height;
+ unsigned long *node = head->node;
+
+ if (height == 0)
+ return NULL;
+
+ for ( ; height > 1; height--) {
+ for (i = 0; i < geo->no_pairs; i++)
+ if (keycmp(geo, node, i, key) <= 0)
+ break;
+ if (i == geo->no_pairs)
+ return NULL;
+ node = (unsigned long *)bval(geo, node, i);
+ if (!node)
+ return NULL;
+ }
+
+ if (!node)
+ return NULL;
+
+ for (i = 0; i < geo->no_pairs; i++)
+ if (keycmp(geo, node, i, key) == 0)
+ return (void *)bval(geo, node, i);
+ return NULL;
+}
+
+static int getpos(struct btree_geo *geo, unsigned long *node,
+ unsigned long *key)
+{
+ int i;
+
+ for (i = 0; i < geo->no_pairs; i++) {
+ if (keycmp(geo, node, i, key) <= 0)
+ break;
+ }
+ return i;
+}
+
+static int getfill(struct btree_geo *geo, unsigned long *node, int start)
+{
+ int i;
+
+ for (i = start; i < geo->no_pairs; i++)
+ if (bval(geo, node, i) == 0)
+ break;
+ return i;
+}
+
+/*
+ * locate the correct leaf node in the btree
+ */
+static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, int level)
+{
+ unsigned long *node = head->node;
+ int i, height;
+
+ for (height = head->height; height > level; height--) {
+ for (i = 0; i < geo->no_pairs; i++)
+ if (keycmp(geo, node, i, key) <= 0)
+ break;
+
+ if ((i == geo->no_pairs) || !bval(geo, node, i)) {
+ /* right-most key is too large, update it */
+ i--;
+ setkey(geo, node, key, i);
+ }
+ BUG_ON(i < 0);
+ node = (unsigned long *)bval(geo, node, i);
+ }
+ BUG_ON(!node);
+ return node;
+}
+
+static int btree_grow(struct btree_head *head, struct btree_geo *geo)
+{
+ unsigned long *node;
+ int fill;
+
+ node = btree_node_alloc(head);
+ if (!node)
+ return -ENOMEM;
+ if (head->node) {
+ fill = getfill(geo, head->node, 0);
+ setkey(geo, node, bkey(geo, head->node, fill - 1), 0);
+ setval(geo, node, (unsigned long)head->node, 0);
+ }
+ head->node = node;
+ head->height++;
+ return 0;
+}
+
+static int btree_insert_level(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, unsigned long val, int level)
+{
+ unsigned long *node;
+ int i, pos, fill, err;
+
+ BUG_ON(!val);
+ if (head->height < level) {
+ err = btree_grow(head, geo);
+ if (err)
+ return err;
+ }
+
+retry:
+ node = find_level(head, geo, key, level);
+ pos = getpos(geo, node, key);
+ fill = getfill(geo, node, pos);
+ /* two identical keys are not allowed */
+ BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0);
+
+ if (fill == geo->no_pairs) {
+ /* need to split node */
+ unsigned long *new;
+
+ new = btree_node_alloc(head);
+ if (!new)
+ return -ENOMEM;
+ err = btree_insert_level(head, geo,
+ bkey(geo, node, fill / 2 - 1),
+ (unsigned long)new, level + 1);
+ if (err) {
+ kfree(new);
+ return err;
+ }
+ for (i = 0; i < fill / 2; i++) {
+ setkey(geo, new, bkey(geo, node, i), i);
+ setval(geo, new, bval(geo, node, i), i);
+ setkey(geo, node, bkey(geo, node, i + fill / 2), i);
+ setval(geo, node, bval(geo, node, i + fill / 2), i);
+ clearpair(geo, node, i + fill / 2);
+ }
+ if (fill & 1) {
+ setkey(geo, node, bkey(geo, node, fill - 1), i);
+ setval(geo, node, bval(geo, node, fill - 1), i);
+ clearpair(geo, node, fill - 1);
+ }
+ goto retry;
+ }
+ BUG_ON(fill >= geo->no_pairs);
+
+ /* shift and insert */
+ for (i = fill; i > pos; i--) {
+ setkey(geo, node, bkey(geo, node, i - 1), i);
+ setval(geo, node, bval(geo, node, i - 1), i);
+ }
+ setkey(geo, node, key, pos);
+ setval(geo, node, val, pos);
+
+ return 0;
+}
+
+int btree_insert(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, void *val)
+{
+ return btree_insert_level(head, geo, key, (unsigned long)val, 1);
+}
+
+static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, int level)
+{
+ unsigned long *node;
+ int i, pos, fill;
+ void *ret;
+
+ if (level > head->height) {
+ /* we recursed all the way up */
+ head->height = 0;
+ head->node = NULL;
+ return NULL;
+ }
+
+ node = find_level(head, geo, key, level);
+ pos = getpos(geo, node, key);
+ fill = getfill(geo, node, pos);
+ if ((level == 1) && (keycmp(geo, node, pos, key) != 0))
+ return NULL;
+ ret = (void *)bval(geo, node, pos);
+
+ /* remove and shift */
+ for (i = pos; i < fill - 1; i++) {
+ setkey(geo, node, bkey(geo, node, i + 1), i);
+ setval(geo, node, bval(geo, node, i + 1), i);
+ }
+ clearpair(geo, node, fill - 1);
+
+ if (fill - 1 < geo->no_pairs / 2) {
+ /*
+ * At this point there *should* be code to either merge with
+ * a neighboring node or steal some entries from it to preserve
+ * the btree invariant of only having nodes with n/2..n
+ * elements.
+ *
+ * As you can see, that code is left as an excercise to the
+ * reader or anyone noticing severe performance problems in
+ * very rare cases.
+ *
+ * As-is this code "implements" a method called lazy deletion,
+ * which according to text books is relatively common in
+ * databases and usually works quite well.
+ * Not so usually, the btree can degrade into very long lists
+ * of 1-element nodes and perform accordingly.
+ */
+ }
+ if (fill - 1 == 0) {
+ btree_remove_level(head, geo, key, level + 1);
+ kfree(node);
+ }
+
+ return ret;
+}
+
+void *btree_remove(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key)
+{
+ if (head->height == 0)
+ return NULL;
+
+ return btree_remove_level(head, geo, key, 1);
+}
+
+int btree_merge(struct btree_head *target, struct btree_head *victim,
+ struct btree_geo *geo, unsigned long *duplicate)
+{
+ unsigned long *key;
+ void *val;
+ int err;
+
+ BUG_ON(target == victim);
+
+ if (!(target->node)) {
+ /* target is empty, just copy fields over */
+ target->node = victim->node;
+ target->height = victim->height;
+ __btree_init(victim);
+ return 0;
+ }
+
+ for (;;) {
+ key = btree_last(victim, geo);
+ if (!key)
+ break;
+ val = btree_lookup(victim, geo, key);
+ err = btree_insert(target, geo, key, val);
+ if (err)
+ return err;
+ /* We must make a copy of the key, as the original will get
+ * mangled inside btree_remove. */
+ longcpy(duplicate, key, geo->keylen);
+ btree_remove(victim, geo, duplicate);
+ }
+ return 0;
+}
+
+static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *node, long opaque,
+ void (*func)(void *elem, long opaque,
+ unsigned long *key, size_t index, void *func2),
+ void *func2, int reap, int height, size_t count)
+{
+ int i;
+ unsigned long *child;
+
+ for (i = 0; i < geo->no_pairs; i++) {
+ child = (void *)bval(geo, node, i);
+ if (!child)
+ break;
+ if (height > 1)
+ count = __btree_for_each(head, geo, child, opaque,
+ func, func2, reap, height - 1, count);
+ else
+ func(child, opaque, bkey(geo, node, i), count++,
+ func2);
+ }
+ if (reap)
+ kfree(node);
+ return count;
+}
+
+static void empty(void *elem, long opaque, unsigned long *key, size_t index,
+ void *func2)
+{
+}
+
+void visitorl(void *elem, long opaque, unsigned long *key, size_t index,
+ void *__func)
+{
+ visitorl_t func = __func;
+
+ func(elem, opaque, *key, index);
+}
+
+void visitor32(void *elem, long opaque, unsigned long *__key, size_t index,
+ void *__func)
+{
+ visitor32_t func = __func;
+ u32 *key = (void *)__key;
+
+ func(elem, opaque, *key, index);
+}
+
+void visitor64(void *elem, long opaque, unsigned long *__key, size_t index,
+ void *__func)
+{
+ visitor64_t func = __func;
+ u64 *key = (void *)__key;
+
+ func(elem, opaque, *key, index);
+}
+
+void visitor128(void *elem, long opaque, unsigned long *__key, size_t index,
+ void *__func)
+{
+ visitor128_t func = __func;
+ u64 *key = (void *)__key;
+
+ func(elem, opaque, key[0], key[1], index);
+}
+
+size_t btree_visitor(struct btree_head *head, struct btree_geo *geo,
+ long opaque,
+ void (*func)(void *elem, long opaque, unsigned long *key,
+ size_t index, void *func2), void *func2)
+{
+ size_t count = 0;
+
+ if (!func)
+ func = empty;
+ if (head->node)
+ count = __btree_for_each(head, geo, head->node, opaque, func,
+ func2, 0, head->height, 0);
+ return count;
+}
+
+size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo,
+ long opaque,
+ void (*func)(void *elem, long opaque, unsigned long *key,
+ size_t index, void *func2), void *func2)
+{
+ size_t count = 0;
+
+ if (!func)
+ func = empty;
+ if (head->node)
+ count = __btree_for_each(head, geo, head->node, opaque, func,
+ func2, 1, head->height, 0);
+ __btree_init(head);
+ return count;
+}
+
+static int __init btree_module_init(void)
+{
+ btree_cachep = kmem_cache_create("btree_node", NODESIZE, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ return 0;
+}
+
+/* If core code starts using btree, initialization should happen even earlier */
+module_init(btree_module_init);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/