Re: [PATCH v5 3/3] z3fold: add shrinker

From: Dan Streetman
Date: Mon Oct 17 2016 - 22:08:41 EST


On Sat, Oct 15, 2016 at 8:05 AM, Vitaly Wool <vitalywool@xxxxxxxxx> wrote:
> This patch implements shrinker for z3fold. This shrinker
> implementation does not free up any pages directly but it allows
> for a denser placement of compressed objects which results in
> less actual pages consumed and higher compression ratio therefore.
>
> This update removes z3fold page compaction from the freeing path
> since we can rely on shrinker to do the job. Also, a new flag
> UNDER_COMPACTION is introduced to protect against two threads
> trying to compact the same page.

i'm completely unconvinced that this should be a shrinker. The
alloc/free paths are much, much better suited to compacting a page
than a shrinker that must scan through all the unbuddied pages. Why
not just improve compaction for the alloc/free paths?


>
> This patch has been checked with the latest Linus's tree.
>
> Signed-off-by: Vitaly Wool <vitalywool@xxxxxxxxx>
> ---
> mm/z3fold.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++-----------
> 1 file changed, 119 insertions(+), 25 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 10513b5..8f84d3c 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -27,6 +27,7 @@
> #include <linux/mm.h>
> #include <linux/module.h>
> #include <linux/preempt.h>
> +#include <linux/shrinker.h>
> #include <linux/slab.h>
> #include <linux/spinlock.h>
> #include <linux/zpool.h>
> @@ -72,6 +73,7 @@ struct z3fold_ops {
> * @unbuddied_nr: number of unbuddied z3fold pages in the pool.
> * @ops: pointer to a structure of user defined operations specified at
> * pool creation time.
> + * @shrinker: shrinker structure to optimize page layout in background
> *
> * This structure is allocated at pool creation time and maintains metadata
> * pertaining to a particular z3fold pool.
> @@ -86,6 +88,7 @@ struct z3fold_pool {
> const struct z3fold_ops *ops;
> struct zpool *zpool;
> const struct zpool_ops *zpool_ops;
> + struct shrinker shrinker;
> };
>
> enum buddy {
> @@ -121,6 +124,7 @@ enum z3fold_page_flags {
> UNDER_RECLAIM = 0,
> PAGE_HEADLESS,
> MIDDLE_CHUNK_MAPPED,
> + UNDER_COMPACTION,
> };
>
> /*****************
> @@ -136,6 +140,9 @@ static int size_to_chunks(size_t size)
> #define for_each_unbuddied_list(_iter, _begin) \
> for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
>
> +#define for_each_unbuddied_list_down(_iter, _end) \
> + for ((_iter) = (_end); (_iter) > 0; (_iter)--)
> +

bikeshed: the conventional suffix is _reverse, not _down, i.e.
for_each_unbuddied_list_reverse()

> /* Initializes the z3fold header of a newly allocated z3fold page */
> static struct z3fold_header *init_z3fold_page(struct page *page)
> {
> @@ -145,6 +152,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
> clear_bit(UNDER_RECLAIM, &page->private);
> clear_bit(PAGE_HEADLESS, &page->private);
> clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
> + clear_bit(UNDER_COMPACTION, &page->private);
>
> zhdr->first_chunks = 0;
> zhdr->middle_chunks = 0;
> @@ -211,6 +219,103 @@ static int num_free_chunks(struct z3fold_header *zhdr)
> return nfree;
> }
>
> +/* Has to be called with lock held */
> +static int z3fold_compact_page(struct z3fold_header *zhdr, bool sync)
> +{
> + struct page *page = virt_to_page(zhdr);
> + void *beg = zhdr;
> +
> +
> + if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
> + !test_bit(UNDER_RECLAIM, &page->private) &&
> + !test_bit(UNDER_COMPACTION, &page->private)) {
> + set_bit(UNDER_COMPACTION, &page->private);

i assume the reclaim check, and new compaction bit check, is due to
the removal of the spinlock, which was incorrect. changing to a
per-page spinlock may be the best way to handle this, but flags are
absolutely not appropriate - they don't provide the needed locking.
Even if the compaction bit was the only locking needed (which it
isn't), it still isn't correct here - while extremely unlikely, it's
still possible for multiple threads to race between checking the
compaction bit, and setting it. That's what test_and_set_bit() is
for.

> + if (zhdr->middle_chunks != 0 &&

only need to check if the middle bud is in use once; if it isn't,
there's no compaction to do.

> + zhdr->first_chunks == 0 &&
> + zhdr->last_chunks == 0) {
> + memmove(beg + ZHDR_SIZE_ALIGNED,
> + beg + (zhdr->start_middle << CHUNK_SHIFT),
> + zhdr->middle_chunks << CHUNK_SHIFT);
> + zhdr->first_chunks = zhdr->middle_chunks;
> + zhdr->middle_chunks = 0;
> + zhdr->start_middle = 0;
> + zhdr->first_num++;
> + clear_bit(UNDER_COMPACTION, &page->private);
> + return 1;
> + }
> + if (sync)
> + goto out;

i don't get it, why is that compaction synchronous while the others
below aren't?

> +
> + /* moving data is expensive, so let's only do that if
> + * there's substantial gain (2+ chunks)

"at least 2 chunks" feels arbitrary...it should be a #define instead
of a magic number...

> + */
> + if (zhdr->middle_chunks != 0 && zhdr->first_chunks != 0 &&
> + zhdr->last_chunks == 0 &&
> + zhdr->start_middle > zhdr->first_chunks + 2) {
> + unsigned short new_start = zhdr->first_chunks + 1;
> + memmove(beg + (new_start << CHUNK_SHIFT),
> + beg + (zhdr->start_middle << CHUNK_SHIFT),
> + zhdr->middle_chunks << CHUNK_SHIFT);
> + zhdr->start_middle = new_start;
> + clear_bit(UNDER_COMPACTION, &page->private);
> + return 1;
> + }
> + if (zhdr->middle_chunks != 0 && zhdr->last_chunks != 0 &&
> + zhdr->first_chunks == 0 &&
> + zhdr->middle_chunks + zhdr->last_chunks <=
> + NCHUNKS - zhdr->start_middle - 2) {
> + unsigned short new_start = NCHUNKS - zhdr->last_chunks -
> + zhdr->middle_chunks;
> + memmove(beg + (new_start << CHUNK_SHIFT),
> + beg + (zhdr->start_middle << CHUNK_SHIFT),
> + zhdr->middle_chunks << CHUNK_SHIFT);

these if clauses, and the memmoves, aren't very readable, could it be
made any better with a separate function?

> + zhdr->start_middle = new_start;
> + clear_bit(UNDER_COMPACTION, &page->private);
> + return 1;
> + }
> + }
> +out:
> + clear_bit(UNDER_COMPACTION, &page->private);
> + return 0;
> +}
> +
> +static unsigned long z3fold_shrink_count(struct shrinker *shrink,
> + struct shrink_control *sc)
> +{
> + struct z3fold_pool *pool = container_of(shrink, struct z3fold_pool,
> + shrinker);
> +
> + return atomic64_read(&pool->unbuddied_nr);
> +}
> +
> +static unsigned long z3fold_shrink_scan(struct shrinker *shrink,
> + struct shrink_control *sc)
> +{
> + struct z3fold_pool *pool = container_of(shrink, struct z3fold_pool,
> + shrinker);
> + struct z3fold_header *zhdr;
> + int i, nr_to_scan = sc->nr_to_scan, nr_shrunk = 0;
> +
> + spin_lock(&pool->lock);
> + for_each_unbuddied_list_down(i, NCHUNKS - 3) {
> + if (!list_empty(&pool->unbuddied[i])) {
> + zhdr = list_first_entry(&pool->unbuddied[i],
> + struct z3fold_header, buddy);
> + list_del(&zhdr->buddy);
> + spin_unlock(&pool->lock);
> + nr_shrunk += z3fold_compact_page(zhdr, false);
> + spin_lock(&pool->lock);
> + list_add(&zhdr->buddy,
> + &pool->unbuddied[num_free_chunks(zhdr)]);

use list_add_tail(), we just compacted it and putting it at the head
of the new unbuddied list will cause it to be unnecessarily scanned
first later.

> + if (!--nr_to_scan)
> + break;
> + }
> + }
> + spin_unlock(&pool->lock);
> + return nr_shrunk;
> +}
> +
> +
> /*****************
> * API Functions
> *****************/
> @@ -230,7 +335,7 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
>
> pool = kzalloc(sizeof(struct z3fold_pool), gfp);
> if (!pool)
> - return NULL;
> + goto out;
> spin_lock_init(&pool->lock);
> for_each_unbuddied_list(i, 0)
> INIT_LIST_HEAD(&pool->unbuddied[i]);
> @@ -238,8 +343,19 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
> INIT_LIST_HEAD(&pool->lru);
> atomic64_set(&pool->pages_nr, 0);
> atomic64_set(&pool->unbuddied_nr, 0);
> + pool->shrinker.count_objects = z3fold_shrink_count;
> + pool->shrinker.scan_objects = z3fold_shrink_scan;
> + pool->shrinker.seeks = DEFAULT_SEEKS;
> + pool->shrinker.batch = NCHUNKS - 4;
> + if (register_shrinker(&pool->shrinker))
> + goto out_free;
> pool->ops = ops;
> return pool;
> +
> +out_free:
> + kfree(pool);
> +out:
> + return NULL;
> }
>
> /**
> @@ -250,31 +366,10 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
> */
> static void z3fold_destroy_pool(struct z3fold_pool *pool)
> {
> + unregister_shrinker(&pool->shrinker);
> kfree(pool);
> }
>
> -/* Has to be called with lock held */
> -static int z3fold_compact_page(struct z3fold_header *zhdr)
> -{
> - struct page *page = virt_to_page(zhdr);
> - void *beg = zhdr;
> -
> -
> - if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
> - zhdr->middle_chunks != 0 &&
> - zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
> - memmove(beg + ZHDR_SIZE_ALIGNED,
> - beg + (zhdr->start_middle << CHUNK_SHIFT),
> - zhdr->middle_chunks << CHUNK_SHIFT);
> - zhdr->first_chunks = zhdr->middle_chunks;
> - zhdr->middle_chunks = 0;
> - zhdr->start_middle = 0;
> - zhdr->first_num++;
> - return 1;
> - }
> - return 0;
> -}
> -
> /**
> * z3fold_alloc() - allocates a region of a given size
> * @pool: z3fold pool from which to allocate
> @@ -464,7 +559,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
> free_z3fold_page(zhdr);
> atomic64_dec(&pool->pages_nr);
> } else {
> - z3fold_compact_page(zhdr);

why remove this?

> /* Add to the unbuddied list */
> freechunks = num_free_chunks(zhdr);
> list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
> @@ -596,7 +690,7 @@ next:
> /* Full, add to buddied list */
> list_add(&zhdr->buddy, &pool->buddied);
> } else {
> - z3fold_compact_page(zhdr);
> + z3fold_compact_page(zhdr, true);
> /* add to unbuddied list */
> freechunks = num_free_chunks(zhdr);
> list_add(&zhdr->buddy,
> --
> 2.4.2