[PATCH 20/29] radix tree: Improve multiorder iterators

From: Matthew Wilcox
Date: Wed Nov 16 2016 - 17:37:05 EST


From: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>

This fixes several interlinked problems with the iterators in the
presence of multiorder entries.

1. radix_tree_iter_next() would only advance by one slot, which would
result in the iterators returning the same entry more than once if there
were sibling entries.

2. radix_tree_next_slot() could return an internal pointer instead of
a user pointer if a tagged multiorder entry was immediately followed by
an entry of lower order.

3. radix_tree_next_slot() expanded to a lot more code than it used to
when multiorder support was compiled in. And I wasn't comfortable with
entry_to_node() being in a header file.

Fixing radix_tree_iter_next() for the presence of sibling entries
necessarily involves examining the contents of the radix tree, so we now
need to pass 'slot' to radix_tree_iter_next(), and we need to change the
calling convention so it is called *before* dropping the lock which
protects the tree. Fortunately, unconverted code won't compile.

radix_tree_next_slot() becomes closer to how it looked before multiorder
support was introduced. It only checks to see if the next entry in the
chunk is a sibling entry or a pointer to a node; this should be rare
enough that handling this case out of line is not a performance impact
(and such impact is amortised by the fact that the entry we just
processed was a multiorder entry). Also, radix_tree_next_slot() used
to force a new chunk lookup for untagged entries, which is more
expensive than the out of line sibling entry skipping.

Signed-off-by: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
---
fs/btrfs/tests/btrfs-tests.c | 2 +-
include/linux/radix-tree.h | 63 ++++++++------------
lib/radix-tree.c | 94 ++++++++++++++++++++++++++++++
mm/khugepaged.c | 2 +-
mm/shmem.c | 6 +-
tools/testing/radix-tree/iteration_check.c | 4 +-
tools/testing/radix-tree/multiorder.c | 12 ++++
tools/testing/radix-tree/regression3.c | 6 +-
tools/testing/radix-tree/test.h | 1 +
9 files changed, 142 insertions(+), 48 deletions(-)

diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 73076a0..6d3457a 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -162,7 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
slot = radix_tree_iter_retry(&iter);
continue;
}
- slot = radix_tree_iter_next(&iter);
+ slot = radix_tree_iter_next(slot, &iter);
spin_unlock(&fs_info->buffer_lock);
free_extent_buffer_stale(eb);
spin_lock(&fs_info->buffer_lock);
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 66fb8c0..36c6175 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -424,15 +424,10 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
*
* If the iterator needs to release then reacquire a lock, the chunk may
* have been invalidated by an insertion or deletion. Call this function
- * to continue the iteration from the next index.
+ * before releasing the lock to continue the iteration from the next index.
*/
-static inline __must_check
-void **radix_tree_iter_next(struct radix_tree_iter *iter)
-{
- iter->next_index = __radix_tree_iter_add(iter, 1);
- iter->tags = 0;
- return NULL;
-}
+void **__must_check radix_tree_iter_next(void **slot,
+ struct radix_tree_iter *iter);

/**
* radix_tree_chunk_size - get current chunk size
@@ -446,10 +441,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
return (iter->next_index - iter->index) >> iter_shift(iter);
}

-static inline struct radix_tree_node *entry_to_node(void *ptr)
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
+ unsigned flags);
+#else
+/* Can't happen without sibling entries, but the compiler can't tell that */
+static inline void ** __radix_tree_next_slot(void **slot,
+ struct radix_tree_iter *iter, unsigned flags)
{
- return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
+ return slot;
}
+#endif

/**
* radix_tree_next_slot - find next slot in chunk
@@ -474,51 +476,31 @@ static __always_inline void **
radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
{
if (flags & RADIX_TREE_ITER_TAGGED) {
- void *canon = slot;
-
iter->tags >>= 1;
if (unlikely(!iter->tags))
return NULL;
- while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
- radix_tree_is_internal_node(slot[1])) {
- if (entry_to_node(slot[1]) == canon) {
- iter->tags >>= 1;
- iter->index = __radix_tree_iter_add(iter, 1);
- slot++;
- continue;
- }
- iter->next_index = __radix_tree_iter_add(iter, 1);
- return NULL;
- }
if (likely(iter->tags & 1ul)) {
iter->index = __radix_tree_iter_add(iter, 1);
- return slot + 1;
+ slot++;
+ goto found;
}
if (!(flags & RADIX_TREE_ITER_CONTIG)) {
unsigned offset = __ffs(iter->tags);

- iter->tags >>= offset;
- iter->index = __radix_tree_iter_add(iter, offset + 1);
- return slot + offset + 1;
+ iter->tags >>= offset++;
+ iter->index = __radix_tree_iter_add(iter, offset);
+ slot += offset;
+ goto found;
}
} else {
long count = radix_tree_chunk_size(iter);
- void *canon = slot;

while (--count > 0) {
slot++;
iter->index = __radix_tree_iter_add(iter, 1);

- if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
- radix_tree_is_internal_node(*slot)) {
- if (entry_to_node(*slot) == canon)
- continue;
- iter->next_index = iter->index;
- break;
- }
-
if (likely(*slot))
- return slot;
+ goto found;
if (flags & RADIX_TREE_ITER_CONTIG) {
/* forbid switching to the next chunk */
iter->next_index = 0;
@@ -527,6 +509,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
}
}
return NULL;
+
+ found:
+ if (unlikely(radix_tree_is_internal_node(*slot)))
+ return __radix_tree_next_slot(slot, iter, flags);
+ return slot;
}

/**
@@ -577,6 +564,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
slot || (slot = radix_tree_next_chunk(root, iter, \
RADIX_TREE_ITER_TAGGED | tag)) ; \
slot = radix_tree_next_slot(slot, iter, \
- RADIX_TREE_ITER_TAGGED))
+ RADIX_TREE_ITER_TAGGED | tag))

#endif /* _LINUX_RADIX_TREE_H */
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 09c5f1d..27b53ef 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -69,6 +69,11 @@ struct radix_tree_preload {
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };

+static inline struct radix_tree_node *entry_to_node(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
+}
+
static inline void *node_to_entry(void *ptr)
{
return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
@@ -1138,6 +1143,95 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter,
#endif
}

+static void ** __radix_tree_iter_next(struct radix_tree_node **nodep,
+ void **slot, struct radix_tree_iter *iter)
+{
+ void *sib = node_to_entry(slot - 1);
+
+ while (iter->index < iter->next_index) {
+ *nodep = rcu_dereference_raw(*slot);
+ if (*nodep && *nodep != sib)
+ return slot;
+ slot++;
+ iter->index = __radix_tree_iter_add(iter, 1);
+ iter->tags >>= 1;
+ }
+
+ *nodep = NULL;
+ return NULL;
+}
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
+ unsigned flags)
+{
+ unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
+ struct radix_tree_node *node = rcu_dereference_raw(*slot);
+
+ slot = __radix_tree_iter_next(&node, slot, iter);
+
+ while (radix_tree_is_internal_node(node)) {
+ unsigned offset;
+
+ if (node == RADIX_TREE_RETRY)
+ return slot;
+ node = entry_to_node(node);
+
+ if (flags & RADIX_TREE_ITER_TAGGED) {
+ unsigned tag_long, tag_bit;
+ offset = radix_tree_find_next_bit(node, tag, 0);
+ if (offset == RADIX_TREE_MAP_SIZE)
+ return NULL;
+ slot = &node->slots[offset];
+
+ tag_long = offset / BITS_PER_LONG;
+ tag_bit = offset % BITS_PER_LONG;
+ iter->tags = node->tags[tag][tag_long] >> tag_bit;
+ BUG_ON(iter->tags >= (RADIX_TREE_MAP_SIZE * 2));
+ node = rcu_dereference_raw(*slot);
+ } else {
+ offset = 0;
+ slot = &node->slots[0];
+ for (;;) {
+ node = rcu_dereference_raw(*slot);
+ if (node)
+ break;
+ slot++;
+ offset++;
+ if (offset == RADIX_TREE_MAP_SIZE)
+ return NULL;
+ }
+ }
+ if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
+ goto none;
+ iter->shift -= RADIX_TREE_MAP_SHIFT;
+ iter->index = __radix_tree_iter_add(iter, offset);
+ iter->next_index = (iter->index | shift_maxindex(iter->shift)) +
+ 1;
+ }
+
+ return slot;
+ none:
+ iter->next_index = 0;
+ return NULL;
+}
+EXPORT_SYMBOL(__radix_tree_next_slot);
+#endif
+
+void **radix_tree_iter_next(void **slot, struct radix_tree_iter *iter)
+{
+ struct radix_tree_node *node;
+
+ slot++;
+ iter->index = __radix_tree_iter_add(iter, 1);
+ node = rcu_dereference_raw(*slot);
+ __radix_tree_iter_next(&node, slot, iter);
+ iter->next_index = iter->index;
+ iter->tags = 0;
+ return NULL;
+}
+EXPORT_SYMBOL(radix_tree_iter_next);
+
/**
* radix_tree_next_chunk - find next chunk of slots for iteration
*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 728d779..46155d1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1614,8 +1614,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
present++;

if (need_resched()) {
+ slot = radix_tree_iter_next(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
diff --git a/mm/shmem.c b/mm/shmem.c
index 166ebf5..0b3fe33 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -658,8 +658,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
swapped++;

if (need_resched()) {
+ slot = radix_tree_iter_next(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}

@@ -2434,8 +2434,8 @@ static void shmem_tag_pins(struct address_space *mapping)
}

if (need_resched()) {
+ slot = radix_tree_iter_next(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2504,8 +2504,8 @@ static int shmem_wait_for_pins(struct address_space *mapping)
spin_unlock_irq(&mapping->tree_lock);
continue_resched:
if (need_resched()) {
+ slot = radix_tree_iter_next(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c
index df71cb8..6eca4c2 100644
--- a/tools/testing/radix-tree/iteration_check.c
+++ b/tools/testing/radix-tree/iteration_check.c
@@ -79,7 +79,7 @@ static void *tagged_iteration_fn(void *arg)
}

if (rand_r(&seeds[0]) % 50 == 0) {
- slot = radix_tree_iter_next(&iter);
+ slot = radix_tree_iter_next(slot, &iter);
rcu_read_unlock();
rcu_barrier();
rcu_read_lock();
@@ -127,7 +127,7 @@ static void *untagged_iteration_fn(void *arg)
}

if (rand_r(&seeds[1]) % 50 == 0) {
- slot = radix_tree_iter_next(&iter);
+ slot = radix_tree_iter_next(slot, &iter);
rcu_read_unlock();
rcu_barrier();
rcu_read_lock();
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 25e0463..588209a 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -232,10 +232,14 @@ void multiorder_iteration(void)
int height = order[i] / RADIX_TREE_MAP_SHIFT;
int shift = height * RADIX_TREE_MAP_SHIFT;
int mask = (1 << order[i]) - 1;
+ struct item *item = *slot;

assert(iter.index >= (index[i] &~ mask));
assert(iter.index <= (index[i] | mask));
assert(iter.shift == shift);
+ assert(!radix_tree_is_internal_node(item));
+ assert(item->index >= (index[i] &~ mask));
+ assert(item->index <= (index[i] | mask));
i++;
}
}
@@ -279,12 +283,16 @@ void multiorder_tagged_iteration(void)
}

radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) {
+ struct item *item = *slot;
for (k = i; index[k] < tag_index[i]; k++)
;
mask = (1 << order[k]) - 1;

assert(iter.index >= (tag_index[i] &~ mask));
assert(iter.index <= (tag_index[i] | mask));
+ assert(!radix_tree_is_internal_node(item));
+ assert(item->index >= (tag_index[i] &~ mask));
+ assert(item->index <= (tag_index[i] | mask));
i++;
}
}
@@ -303,12 +311,16 @@ void multiorder_tagged_iteration(void)
}

radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) {
+ struct item *item = *slot;
for (k = i; index[k] < tag_index[i]; k++)
;
mask = (1 << order[k]) - 1;

assert(iter.index >= (tag_index[i] &~ mask));
assert(iter.index <= (tag_index[i] | mask));
+ assert(!radix_tree_is_internal_node(item));
+ assert(item->index >= (tag_index[i] &~ mask));
+ assert(item->index <= (tag_index[i] | mask));
i++;
}
}
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c
index 1f06ed7..4d28eeb 100644
--- a/tools/testing/radix-tree/regression3.c
+++ b/tools/testing/radix-tree/regression3.c
@@ -88,7 +88,7 @@ void regression3_test(void)
printf("slot %ld %p\n", iter.index, *slot);
if (!iter.index) {
printf("next at %ld\n", iter.index);
- slot = radix_tree_iter_next(&iter);
+ slot = radix_tree_iter_next(slot, &iter);
}
}

@@ -96,7 +96,7 @@ void regression3_test(void)
printf("contig %ld %p\n", iter.index, *slot);
if (!iter.index) {
printf("next at %ld\n", iter.index);
- slot = radix_tree_iter_next(&iter);
+ slot = radix_tree_iter_next(slot, &iter);
}
}

@@ -106,7 +106,7 @@ void regression3_test(void)
printf("tagged %ld %p\n", iter.index, *slot);
if (!iter.index) {
printf("next at %ld\n", iter.index);
- slot = radix_tree_iter_next(&iter);
+ slot = radix_tree_iter_next(slot, &iter);
}
}

diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 33d2b6b..b678f13 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -43,6 +43,7 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag);
extern int nr_allocated;

/* Normally private parts of lib/radix-tree.c */
+struct radix_tree_node *entry_to_node(void *ptr);
void radix_tree_dump(struct radix_tree_root *root);
int root_tag_get(struct radix_tree_root *root, unsigned int tag);
unsigned long node_maxindex(struct radix_tree_node *);
--
2.10.2