Re: [PATCH 20/29] radix tree: Improve multiorder iterators
From: Konstantin Khlebnikov
Date: Fri Nov 18 2016 - 06:47:45 EST
On Thu, Nov 17, 2016 at 3:17 AM, Matthew Wilcox
<mawilcox@xxxxxxxxxxxxxxxxx> wrote:
> From: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
This code still looks overengineered for me.
>
> This fixes several interlinked problems with the iterators in the
> presence of multiorder entries.
>
> 1. radix_tree_iter_next() would only advance by one slot, which would
> result in the iterators returning the same entry more than once if there
> were sibling entries.
Is this a problem? Do we have users who cannot evalate length of entry
by looking into it head?
>
> 2. radix_tree_next_slot() could return an internal pointer instead of
> a user pointer if a tagged multiorder entry was immediately followed by
> an entry of lower order.
>
> 3. radix_tree_next_slot() expanded to a lot more code than it used to
> when multiorder support was compiled in. And I wasn't comfortable with
> entry_to_node() being in a header file.
>
> Fixing radix_tree_iter_next() for the presence of sibling entries
> necessarily involves examining the contents of the radix tree, so we now
> need to pass 'slot' to radix_tree_iter_next(), and we need to change the
> calling convention so it is called *before* dropping the lock which
> protects the tree. Fortunately, unconverted code won't compile.
>
> radix_tree_next_slot() becomes closer to how it looked before multiorder
> support was introduced. It only checks to see if the next entry in the
> chunk is a sibling entry or a pointer to a node; this should be rare
> enough that handling this case out of line is not a performance impact
> (and such impact is amortised by the fact that the entry we just
> processed was a multiorder entry). Also, radix_tree_next_slot() used
> to force a new chunk lookup for untagged entries, which is more
> expensive than the out of line sibling entry
>
> Signed-off-by: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
> ---
> fs/btrfs/tests/btrfs-tests.c | 2 +-
> include/linux/radix-tree.h | 63 ++++++++------------
> lib/radix-tree.c | 94 ++++++++++++++++++++++++++++++
> mm/khugepaged.c | 2 +-
> mm/shmem.c | 6 +-
> tools/testing/radix-tree/iteration_check.c | 4 +-
> tools/testing/radix-tree/multiorder.c | 12 ++++
> tools/testing/radix-tree/regression3.c | 6 +-
> tools/testing/radix-tree/test.h | 1 +
> 9 files changed, 142 insertions(+), 48 deletions(-)
>
> diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
> index 73076a0..6d3457a 100644
> --- a/fs/btrfs/tests/btrfs-tests.c
> +++ b/fs/btrfs/tests/btrfs-tests.c
> @@ -162,7 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
> slot = radix_tree_iter_retry(&iter);
> continue;
> }
> - slot = radix_tree_iter_next(&iter);
> + slot = radix_tree_iter_next(slot, &iter);
> spin_unlock(&fs_info->buffer_lock);
> free_extent_buffer_stale(eb);
> spin_lock(&fs_info->buffer_lock);
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index 66fb8c0..36c6175 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -424,15 +424,10 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
> *
> * If the iterator needs to release then reacquire a lock, the chunk may
> * have been invalidated by an insertion or deletion. Call this function
> - * to continue the iteration from the next index.
> + * before releasing the lock to continue the iteration from the next index.
> */
> -static inline __must_check
> -void **radix_tree_iter_next(struct radix_tree_iter *iter)
> -{
> - iter->next_index = __radix_tree_iter_add(iter, 1);
> - iter->tags = 0;
> - return NULL;
> -}
> +void **__must_check radix_tree_iter_next(void **slot,
> + struct radix_tree_iter *iter);
>
> /**
> * radix_tree_chunk_size - get current chunk size
> @@ -446,10 +441,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
> return (iter->next_index - iter->index) >> iter_shift(iter);
> }
>
> -static inline struct radix_tree_node *entry_to_node(void *ptr)
> +#ifdef CONFIG_RADIX_TREE_MULTIORDER
> +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
> + unsigned flags);
> +#else
> +/* Can't happen without sibling entries, but the compiler can't tell that */
> +static inline void ** __radix_tree_next_slot(void **slot,
> + struct radix_tree_iter *iter, unsigned flags)
> {
> - return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
> + return slot;
> }
> +#endif
>
> /**
> * radix_tree_next_slot - find next slot in chunk
> @@ -474,51 +476,31 @@ static __always_inline void **
> radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
> {
> if (flags & RADIX_TREE_ITER_TAGGED) {
> - void *canon = slot;
> -
> iter->tags >>= 1;
> if (unlikely(!iter->tags))
> return NULL;
> - while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
> - radix_tree_is_internal_node(slot[1])) {
> - if (entry_to_node(slot[1]) == canon) {
> - iter->tags >>= 1;
> - iter->index = __radix_tree_iter_add(iter, 1);
> - slot++;
> - continue;
> - }
> - iter->next_index = __radix_tree_iter_add(iter, 1);
> - return NULL;
> - }
> if (likely(iter->tags & 1ul)) {
> iter->index = __radix_tree_iter_add(iter, 1);
> - return slot + 1;
> + slot++;
> + goto found;
> }
> if (!(flags & RADIX_TREE_ITER_CONTIG)) {
> unsigned offset = __ffs(iter->tags);
>
> - iter->tags >>= offset;
> - iter->index = __radix_tree_iter_add(iter, offset + 1);
> - return slot + offset + 1;
> + iter->tags >>= offset++;
> + iter->index = __radix_tree_iter_add(iter, offset);
> + slot += offset;
> + goto found;
> }
> } else {
> long count = radix_tree_chunk_size(iter);
> - void *canon = slot;
>
> while (--count > 0) {
> slot++;
> iter->index = __radix_tree_iter_add(iter, 1);
>
> - if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
> - radix_tree_is_internal_node(*slot)) {
> - if (entry_to_node(*slot) == canon)
> - continue;
> - iter->next_index = iter->index;
> - break;
> - }
> -
> if (likely(*slot))
> - return slot;
> + goto found;
> if (flags & RADIX_TREE_ITER_CONTIG) {
> /* forbid switching to the next chunk */
> iter->next_index = 0;
> @@ -527,6 +509,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
> }
> }
> return NULL;
> +
> + found:
> + if (unlikely(radix_tree_is_internal_node(*slot)))
> + return __radix_tree_next_slot(slot, iter, flags);
> + return slot;
> }
>
> /**
> @@ -577,6 +564,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
> slot || (slot = radix_tree_next_chunk(root, iter, \
> RADIX_TREE_ITER_TAGGED | tag)) ; \
> slot = radix_tree_next_slot(slot, iter, \
> - RADIX_TREE_ITER_TAGGED))
> + RADIX_TREE_ITER_TAGGED | tag))
>
> #endif /* _LINUX_RADIX_TREE_H */
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index 09c5f1d..27b53ef 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -69,6 +69,11 @@ struct radix_tree_preload {
> };
> static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
>
> +static inline struct radix_tree_node *entry_to_node(void *ptr)
> +{
> + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
> +}
> +
> static inline void *node_to_entry(void *ptr)
> {
> return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
> @@ -1138,6 +1143,95 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter,
> #endif
> }
>
> +static void ** __radix_tree_iter_next(struct radix_tree_node **nodep,
> + void **slot, struct radix_tree_iter *iter)
> +{
> + void *sib = node_to_entry(slot - 1);
> +
> + while (iter->index < iter->next_index) {
> + *nodep = rcu_dereference_raw(*slot);
> + if (*nodep && *nodep != sib)
> + return slot;
> + slot++;
> + iter->index = __radix_tree_iter_add(iter, 1);
> + iter->tags >>= 1;
> + }
> +
> + *nodep = NULL;
> + return NULL;
> +}
> +
> +#ifdef CONFIG_RADIX_TREE_MULTIORDER
> +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
> + unsigned flags)
> +{
> + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
> + struct radix_tree_node *node = rcu_dereference_raw(*slot);
> +
> + slot = __radix_tree_iter_next(&node, slot, iter);
> +
> + while (radix_tree_is_internal_node(node)) {
> + unsigned offset;
> +
> + if (node == RADIX_TREE_RETRY)
> + return slot;
> + node = entry_to_node(node);
> +
> + if (flags & RADIX_TREE_ITER_TAGGED) {
> + unsigned tag_long, tag_bit;
> + offset = radix_tree_find_next_bit(node, tag, 0);
> + if (offset == RADIX_TREE_MAP_SIZE)
> + return NULL;
> + slot = &node->slots[offset];
> +
> + tag_long = offset / BITS_PER_LONG;
> + tag_bit = offset % BITS_PER_LONG;
> + iter->tags = node->tags[tag][tag_long] >> tag_bit;
> + BUG_ON(iter->tags >= (RADIX_TREE_MAP_SIZE * 2));
> + node = rcu_dereference_raw(*slot);
> + } else {
> + offset = 0;
> + slot = &node->slots[0];
> + for (;;) {
> + node = rcu_dereference_raw(*slot);
> + if (node)
> + break;
> + slot++;
> + offset++;
> + if (offset == RADIX_TREE_MAP_SIZE)
> + return NULL;
> + }
> + }
> + if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
> + goto none;
> + iter->shift -= RADIX_TREE_MAP_SHIFT;
> + iter->index = __radix_tree_iter_add(iter, offset);
> + iter->next_index = (iter->index | shift_maxindex(iter->shift)) +
> + 1;
> + }
> +
> + return slot;
> + none:
> + iter->next_index = 0;
> + return NULL;
> +}
> +EXPORT_SYMBOL(__radix_tree_next_slot);
> +#endif
> +
> +void **radix_tree_iter_next(void **slot, struct radix_tree_iter *iter)
> +{
> + struct radix_tree_node *node;
> +
> + slot++;
> + iter->index = __radix_tree_iter_add(iter, 1);
> + node = rcu_dereference_raw(*slot);
> + __radix_tree_iter_next(&node, slot, iter);
> + iter->next_index = iter->index;
> + iter->tags = 0;
> + return NULL;
> +}
> +EXPORT_SYMBOL(radix_tree_iter_next);
> +
> /**
> * radix_tree_next_chunk - find next chunk of slots for iteration
> *
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 728d779..46155d1 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1614,8 +1614,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
> present++;
>
> if (need_resched()) {
> + slot = radix_tree_iter_next(slot, &iter);
> cond_resched_rcu();
> - slot = radix_tree_iter_next(&iter);
> }
> }
> rcu_read_unlock();
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 166ebf5..0b3fe33 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -658,8 +658,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
> swapped++;
>
> if (need_resched()) {
> + slot = radix_tree_iter_next(slot, &iter);
> cond_resched_rcu();
> - slot = radix_tree_iter_next(&iter);
> }
> }
>
> @@ -2434,8 +2434,8 @@ static void shmem_tag_pins(struct address_space *mapping)
> }
>
> if (need_resched()) {
> + slot = radix_tree_iter_next(slot, &iter);
> cond_resched_rcu();
> - slot = radix_tree_iter_next(&iter);
> }
> }
> rcu_read_unlock();
> @@ -2504,8 +2504,8 @@ static int shmem_wait_for_pins(struct address_space *mapping)
> spin_unlock_irq(&mapping->tree_lock);
> continue_resched:
> if (need_resched()) {
> + slot = radix_tree_iter_next(slot, &iter);
> cond_resched_rcu();
> - slot = radix_tree_iter_next(&iter);
> }
> }
> rcu_read_unlock();
> diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c
> index df71cb8..6eca4c2 100644
> --- a/tools/testing/radix-tree/iteration_check.c
> +++ b/tools/testing/radix-tree/iteration_check.c
> @@ -79,7 +79,7 @@ static void *tagged_iteration_fn(void *arg)
> }
>
> if (rand_r(&seeds[0]) % 50 == 0) {
> - slot = radix_tree_iter_next(&iter);
> + slot = radix_tree_iter_next(slot, &iter);
> rcu_read_unlock();
> rcu_barrier();
> rcu_read_lock();
> @@ -127,7 +127,7 @@ static void *untagged_iteration_fn(void *arg)
> }
>
> if (rand_r(&seeds[1]) % 50 == 0) {
> - slot = radix_tree_iter_next(&iter);
> + slot = radix_tree_iter_next(slot, &iter);
> rcu_read_unlock();
> rcu_barrier();
> rcu_read_lock();
> diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
> index 25e0463..588209a 100644
> --- a/tools/testing/radix-tree/multiorder.c
> +++ b/tools/testing/radix-tree/multiorder.c
> @@ -232,10 +232,14 @@ void multiorder_iteration(void)
> int height = order[i] / RADIX_TREE_MAP_SHIFT;
> int shift = height * RADIX_TREE_MAP_SHIFT;
> int mask = (1 << order[i]) - 1;
> + struct item *item = *slot;
>
> assert(iter.index >= (index[i] &~ mask));
> assert(iter.index <= (index[i] | mask));
> assert(iter.shift == shift);
> + assert(!radix_tree_is_internal_node(item));
> + assert(item->index >= (index[i] &~ mask));
> + assert(item->index <= (index[i] | mask));
> i++;
> }
> }
> @@ -279,12 +283,16 @@ void multiorder_tagged_iteration(void)
> }
>
> radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) {
> + struct item *item = *slot;
> for (k = i; index[k] < tag_index[i]; k++)
> ;
> mask = (1 << order[k]) - 1;
>
> assert(iter.index >= (tag_index[i] &~ mask));
> assert(iter.index <= (tag_index[i] | mask));
> + assert(!radix_tree_is_internal_node(item));
> + assert(item->index >= (tag_index[i] &~ mask));
> + assert(item->index <= (tag_index[i] | mask));
> i++;
> }
> }
> @@ -303,12 +311,16 @@ void multiorder_tagged_iteration(void)
> }
>
> radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) {
> + struct item *item = *slot;
> for (k = i; index[k] < tag_index[i]; k++)
> ;
> mask = (1 << order[k]) - 1;
>
> assert(iter.index >= (tag_index[i] &~ mask));
> assert(iter.index <= (tag_index[i] | mask));
> + assert(!radix_tree_is_internal_node(item));
> + assert(item->index >= (tag_index[i] &~ mask));
> + assert(item->index <= (tag_index[i] | mask));
> i++;
> }
> }
> diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c
> index 1f06ed7..4d28eeb 100644
> --- a/tools/testing/radix-tree/regression3.c
> +++ b/tools/testing/radix-tree/regression3.c
> @@ -88,7 +88,7 @@ void regression3_test(void)
> printf("slot %ld %p\n", iter.index, *slot);
> if (!iter.index) {
> printf("next at %ld\n", iter.index);
> - slot = radix_tree_iter_next(&iter);
> + slot = radix_tree_iter_next(slot, &iter);
> }
> }
>
> @@ -96,7 +96,7 @@ void regression3_test(void)
> printf("contig %ld %p\n", iter.index, *slot);
> if (!iter.index) {
> printf("next at %ld\n", iter.index);
> - slot = radix_tree_iter_next(&iter);
> + slot = radix_tree_iter_next(slot, &iter);
> }
> }
>
> @@ -106,7 +106,7 @@ void regression3_test(void)
> printf("tagged %ld %p\n", iter.index, *slot);
> if (!iter.index) {
> printf("next at %ld\n", iter.index);
> - slot = radix_tree_iter_next(&iter);
> + slot = radix_tree_iter_next(slot, &iter);
> }
> }
>
> diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
> index 33d2b6b..b678f13 100644
> --- a/tools/testing/radix-tree/test.h
> +++ b/tools/testing/radix-tree/test.h
> @@ -43,6 +43,7 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag);
> extern int nr_allocated;
>
> /* Normally private parts of lib/radix-tree.c */
> +struct radix_tree_node *entry_to_node(void *ptr);
> void radix_tree_dump(struct radix_tree_root *root);
> int root_tag_get(struct radix_tree_root *root, unsigned int tag);
> unsigned long node_maxindex(struct radix_tree_node *);
> --
> 2.10.2
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxxx For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>