Re: [PATCH v3 12/16] zsmalloc: zs_compact refactoring

From: Minchan Kim
Date: Mon Apr 04 2016 - 05:01:15 EST


Hello Chulmin,

On Mon, Apr 04, 2016 at 05:04:16PM +0900, Chulmin Kim wrote:
> On 2016ë 03ì 30ì 16:12, Minchan Kim wrote:
> >Currently, we rely on class->lock to prevent zspage destruction.
> >It was okay until now because the critical section is short but
> >with run-time migration, it could be long so class->lock is not
> >a good apporach any more.
> >
> >So, this patch introduces [un]freeze_zspage functions which
> >freeze allocated objects in the zspage with pinning tag so
> >user cannot free using object. With those functions, this patch
> >redesign compaction.
> >
> >Those functions will be used for implementing zspage runtime
> >migrations, too.
> >
> >Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
> >---
> > mm/zsmalloc.c | 393 ++++++++++++++++++++++++++++++++++++++--------------------
> > 1 file changed, 257 insertions(+), 136 deletions(-)
> >
> >diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> >index b11dcd718502..ac8ca7b10720 100644
> >--- a/mm/zsmalloc.c
> >+++ b/mm/zsmalloc.c
> >@@ -922,6 +922,13 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
> > return *(unsigned long *)obj;
> > }
> >
> >+static inline int testpin_tag(unsigned long handle)
> >+{
> >+ unsigned long *ptr = (unsigned long *)handle;
> >+
> >+ return test_bit(HANDLE_PIN_BIT, ptr);
> >+}
> >+
> > static inline int trypin_tag(unsigned long handle)
> > {
> > unsigned long *ptr = (unsigned long *)handle;
> >@@ -949,8 +956,7 @@ static void reset_page(struct page *page)
> > page->freelist = NULL;
> > }
> >
> >-static void free_zspage(struct zs_pool *pool, struct size_class *class,
> >- struct page *first_page)
> >+static void free_zspage(struct zs_pool *pool, struct page *first_page)
> > {
> > struct page *nextp, *tmp, *head_extra;
> >
> >@@ -973,11 +979,6 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
> > }
> > reset_page(head_extra);
> > __free_page(head_extra);
> >-
> >- zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> >- class->size, class->pages_per_zspage));
> >- atomic_long_sub(class->pages_per_zspage,
> >- &pool->pages_allocated);
> > }
> >
> > /* Initialize a newly allocated zspage */
> >@@ -1325,6 +1326,11 @@ static bool zspage_full(struct size_class *class, struct page *first_page)
> > return get_zspage_inuse(first_page) == class->objs_per_zspage;
> > }
> >
> >+static bool zspage_empty(struct size_class *class, struct page *first_page)
> >+{
> >+ return get_zspage_inuse(first_page) == 0;
> >+}
> >+
> > unsigned long zs_get_total_pages(struct zs_pool *pool)
> > {
> > return atomic_long_read(&pool->pages_allocated);
> >@@ -1455,7 +1461,6 @@ static unsigned long obj_malloc(struct size_class *class,
> > set_page_private(first_page, handle | OBJ_ALLOCATED_TAG);
> > kunmap_atomic(vaddr);
> > mod_zspage_inuse(first_page, 1);
> >- zs_stat_inc(class, OBJ_USED, 1);
> >
> > obj = location_to_obj(m_page, obj);
> >
> >@@ -1510,6 +1515,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> > }
> >
> > obj = obj_malloc(class, first_page, handle);
> >+ zs_stat_inc(class, OBJ_USED, 1);
> > /* Now move the zspage to another fullness group, if required */
> > fix_fullness_group(class, first_page);
> > record_obj(handle, obj);
> >@@ -1540,7 +1546,6 @@ static void obj_free(struct size_class *class, unsigned long obj)
> > kunmap_atomic(vaddr);
> > set_freeobj(first_page, f_objidx);
> > mod_zspage_inuse(first_page, -1);
> >- zs_stat_dec(class, OBJ_USED, 1);
> > }
> >
> > void zs_free(struct zs_pool *pool, unsigned long handle)
> >@@ -1564,10 +1569,19 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> >
> > spin_lock(&class->lock);
> > obj_free(class, obj);
> >+ zs_stat_dec(class, OBJ_USED, 1);
> > fullness = fix_fullness_group(class, first_page);
> >- if (fullness == ZS_EMPTY)
> >- free_zspage(pool, class, first_page);
> >+ if (fullness == ZS_EMPTY) {
> >+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> >+ class->size, class->pages_per_zspage));
> >+ spin_unlock(&class->lock);
> >+ atomic_long_sub(class->pages_per_zspage,
> >+ &pool->pages_allocated);
> >+ free_zspage(pool, first_page);
> >+ goto out;
> >+ }
> > spin_unlock(&class->lock);
> >+out:
> > unpin_tag(handle);
> >
> > free_handle(pool, handle);
> >@@ -1637,127 +1651,66 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
> > kunmap_atomic(s_addr);
> > }
> >
> >-/*
> >- * Find alloced object in zspage from index object and
> >- * return handle.
> >- */
> >-static unsigned long find_alloced_obj(struct size_class *class,
> >- struct page *page, int index)
> >+static unsigned long handle_from_obj(struct size_class *class,
> >+ struct page *first_page, int obj_idx)
> > {
> >- unsigned long head;
> >- int offset = 0;
> >- unsigned long handle = 0;
> >- void *addr = kmap_atomic(page);
> >-
> >- if (!is_first_page(page))
> >- offset = page->index;
> >- offset += class->size * index;
> >-
> >- while (offset < PAGE_SIZE) {
> >- head = obj_to_head(class, page, addr + offset);
> >- if (head & OBJ_ALLOCATED_TAG) {
> >- handle = head & ~OBJ_ALLOCATED_TAG;
> >- if (trypin_tag(handle))
> >- break;
> >- handle = 0;
> >- }
> >+ struct page *page;
> >+ unsigned long offset_in_page;
> >+ void *addr;
> >+ unsigned long head, handle = 0;
> >
> >- offset += class->size;
> >- index++;
> >- }
> >+ objidx_to_page_and_offset(class, first_page, obj_idx,
> >+ &page, &offset_in_page);
> >
> >+ addr = kmap_atomic(page);
> >+ head = obj_to_head(class, page, addr + offset_in_page);
> >+ if (head & OBJ_ALLOCATED_TAG)
> >+ handle = head & ~OBJ_ALLOCATED_TAG;
> > kunmap_atomic(addr);
> >+
> > return handle;
> > }
> >
> >-struct zs_compact_control {
> >- /* Source page for migration which could be a subpage of zspage. */
> >- struct page *s_page;
> >- /* Destination page for migration which should be a first page
> >- * of zspage. */
> >- struct page *d_page;
> >- /* Starting object index within @s_page which used for live object
> >- * in the subpage. */
> >- int index;
> >-};
> >-
> >-static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
> >- struct zs_compact_control *cc)
> >+static int migrate_zspage(struct size_class *class, struct page *dst_page,
> >+ struct page *src_page)
> > {
> >- unsigned long used_obj, free_obj;
> > unsigned long handle;
> >- struct page *s_page = cc->s_page;
> >- struct page *d_page = cc->d_page;
> >- unsigned long index = cc->index;
> >- int ret = 0;
> >+ unsigned long old_obj, new_obj;
> >+ int i;
> >+ int nr_migrated = 0;
> >
> >- while (1) {
> >- handle = find_alloced_obj(class, s_page, index);
> >- if (!handle) {
> >- s_page = get_next_page(s_page);
> >- if (!s_page)
> >- break;
> >- index = 0;
> >+ for (i = 0; i < class->objs_per_zspage; i++) {
> >+ handle = handle_from_obj(class, src_page, i);
> >+ if (!handle)
> > continue;
> >- }
> >-
> >- /* Stop if there is no more space */
> >- if (zspage_full(class, d_page)) {
> >- unpin_tag(handle);
> >- ret = -ENOMEM;
> >+ if (zspage_full(class, dst_page))
> > break;
> >- }
> >-
> >- used_obj = handle_to_obj(handle);
> >- free_obj = obj_malloc(class, d_page, handle);
> >- zs_object_copy(class, free_obj, used_obj);
> >- index++;
> >+ old_obj = handle_to_obj(handle);
> >+ new_obj = obj_malloc(class, dst_page, handle);
> >+ zs_object_copy(class, new_obj, old_obj);
> >+ nr_migrated++;
> > /*
> > * record_obj updates handle's value to free_obj and it will
> > * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
> > * breaks synchronization using pin_tag(e,g, zs_free) so
> > * let's keep the lock bit.
> > */
> >- free_obj |= BIT(HANDLE_PIN_BIT);
> >- record_obj(handle, free_obj);
> >- unpin_tag(handle);
> >- obj_free(class, used_obj);
> >+ new_obj |= BIT(HANDLE_PIN_BIT);
> >+ record_obj(handle, new_obj);
> >+ obj_free(class, old_obj);
> > }
> >-
> >- /* Remember last position in this iteration */
> >- cc->s_page = s_page;
> >- cc->index = index;
> >-
> >- return ret;
> >-}
> >-
> >-static struct page *isolate_target_page(struct size_class *class)
> >-{
> >- int i;
> >- struct page *page;
> >-
> >- for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> >- page = class->fullness_list[i];
> >- if (page) {
> >- remove_zspage(class, i, page);
> >- break;
> >- }
> >- }
> >-
> >- return page;
> >+ return nr_migrated;
> > }
> >
> > /*
> > * putback_zspage - add @first_page into right class's fullness list
> >- * @pool: target pool
> > * @class: destination class
> > * @first_page: target page
> > *
> > * Return @first_page's updated fullness_group
> > */
> >-static enum fullness_group putback_zspage(struct zs_pool *pool,
> >- struct size_class *class,
> >- struct page *first_page)
> >+static enum fullness_group putback_zspage(struct size_class *class,
> >+ struct page *first_page)
> > {
> > enum fullness_group fullness;
> >
> >@@ -1768,17 +1721,155 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
> > return fullness;
> > }
> >
> >+/*
> >+ * freeze_zspage - freeze all objects in a zspage
> >+ * @class: size class of the page
> >+ * @first_page: first page of zspage
> >+ *
> >+ * Freeze all allocated objects in a zspage so objects couldn't be
> >+ * freed until unfreeze objects. It should be called under class->lock.
> >+ *
> >+ * RETURNS:
> >+ * the number of pinned objects
> >+ */
> >+static int freeze_zspage(struct size_class *class, struct page *first_page)
> >+{
> >+ unsigned long obj_idx;
> >+ struct page *obj_page;
> >+ unsigned long offset;
> >+ void *addr;
> >+ int nr_freeze = 0;
> >+
> >+ for (obj_idx = 0; obj_idx < class->objs_per_zspage; obj_idx++) {
> >+ unsigned long head;
> >+
> >+ objidx_to_page_and_offset(class, first_page, obj_idx,
> >+ &obj_page, &offset);
> >+ addr = kmap_atomic(obj_page);
> >+ head = obj_to_head(class, obj_page, addr + offset);
> >+ if (head & OBJ_ALLOCATED_TAG) {
> >+ unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> >+
> >+ if (!trypin_tag(handle)) {
> >+ kunmap_atomic(addr);
> >+ break;
> >+ }
> >+ nr_freeze++;
> >+ }
> >+ kunmap_atomic(addr);
> >+ }
> >+
> >+ return nr_freeze;
> >+}
> >+
> >+/*
> >+ * unfreeze_page - unfreeze objects freezed by freeze_zspage in a zspage
> >+ * @class: size class of the page
> >+ * @first_page: freezed zspage to unfreeze
> >+ * @nr_obj: the number of objects to unfreeze
> >+ *
> >+ * unfreeze objects in a zspage.
> >+ */
> >+static void unfreeze_zspage(struct size_class *class, struct page *first_page,
> >+ int nr_obj)
> >+{
> >+ unsigned long obj_idx;
> >+ struct page *obj_page;
> >+ unsigned long offset;
> >+ void *addr;
> >+ int nr_unfreeze = 0;
> >+
> >+ for (obj_idx = 0; obj_idx < class->objs_per_zspage &&
> >+ nr_unfreeze < nr_obj; obj_idx++) {
> >+ unsigned long head;
> >+
> >+ objidx_to_page_and_offset(class, first_page, obj_idx,
> >+ &obj_page, &offset);
> >+ addr = kmap_atomic(obj_page);
> >+ head = obj_to_head(class, obj_page, addr + offset);
> >+ if (head & OBJ_ALLOCATED_TAG) {
> >+ unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> >+
> >+ VM_BUG_ON(!testpin_tag(handle));
> >+ unpin_tag(handle);
> >+ nr_unfreeze++;
> >+ }
> >+ kunmap_atomic(addr);
> >+ }
> >+}
> >+
> >+/*
> >+ * isolate_source_page - isolate a zspage for migration source
> >+ * @class: size class of zspage for isolation
> >+ *
> >+ * Returns a zspage which are isolated from list so anyone can
> >+ * allocate a object from that page. As well, freeze all objects
> >+ * allocated in the zspage so anyone cannot access that objects
> >+ * (e.g., zs_map_object, zs_free).
> >+ */
> > static struct page *isolate_source_page(struct size_class *class)
> > {
> > int i;
> > struct page *page = NULL;
> >
> > for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
> >+ int inuse, freezed;
> >+
> > page = class->fullness_list[i];
> > if (!page)
> > continue;
> >
> > remove_zspage(class, i, page);
> >+
> >+ inuse = get_zspage_inuse(page);
> >+ freezed = freeze_zspage(class, page);
> >+
> >+ if (inuse != freezed) {
> >+ unfreeze_zspage(class, page, freezed);
> >+ putback_zspage(class, page);
> >+ page = NULL;
> >+ continue;
> >+ }
> >+
> >+ break;
> >+ }
> >+
> >+ return page;
> >+}
> >+
> >+/*
> >+ * isolate_target_page - isolate a zspage for migration target
> >+ * @class: size class of zspage for isolation
> >+ *
> >+ * Returns a zspage which are isolated from list so anyone can
> >+ * allocate a object from that page. As well, freeze all objects
> >+ * allocated in the zspage so anyone cannot access that objects
> >+ * (e.g., zs_map_object, zs_free).
> >+ */
> >+static struct page *isolate_target_page(struct size_class *class)
> >+{
> >+ int i;
> >+ struct page *page;
> >+
> >+ for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> >+ int inuse, freezed;
> >+
> >+ page = class->fullness_list[i];
> >+ if (!page)
> >+ continue;
> >+
> >+ remove_zspage(class, i, page);
> >+
> >+ inuse = get_zspage_inuse(page);
> >+ freezed = freeze_zspage(class, page);
> >+
> >+ if (inuse != freezed) {
> >+ unfreeze_zspage(class, page, freezed);
> >+ putback_zspage(class, page);
> >+ page = NULL;
> >+ continue;
> >+ }
> >+
> > break;
> > }
> >
> >@@ -1793,9 +1884,11 @@ static struct page *isolate_source_page(struct size_class *class)
> > static unsigned long zs_can_compact(struct size_class *class)
> > {
> > unsigned long obj_wasted;
> >+ unsigned long obj_allocated, obj_used;
> >
> >- obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
> >- zs_stat_get(class, OBJ_USED);
> >+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
> >+ obj_used = zs_stat_get(class, OBJ_USED);
> >+ obj_wasted = obj_allocated - obj_used;
> >
> > obj_wasted /= get_maxobj_per_zspage(class->size,
> > class->pages_per_zspage);
> >@@ -1805,53 +1898,81 @@ static unsigned long zs_can_compact(struct size_class *class)
> >
> > static void __zs_compact(struct zs_pool *pool, struct size_class *class)
> > {
> >- struct zs_compact_control cc;
> >- struct page *src_page;
> >+ struct page *src_page = NULL;
> > struct page *dst_page = NULL;
> >
> >- spin_lock(&class->lock);
> >- while ((src_page = isolate_source_page(class))) {
> >+ while (1) {
> >+ int nr_migrated;
> >
> >- if (!zs_can_compact(class))
> >+ spin_lock(&class->lock);
> >+ if (!zs_can_compact(class)) {
> >+ spin_unlock(&class->lock);
> > break;
> >+ }
> >
> >- cc.index = 0;
> >- cc.s_page = src_page;
> >+ /*
> >+ * Isolate source page and freeze all objects in a zspage
> >+ * to prevent zspage destroying.
> >+ */
> >+ if (!src_page) {
> >+ src_page = isolate_source_page(class);
> >+ if (!src_page) {
> >+ spin_unlock(&class->lock);
> >+ break;
> >+ }
> >+ }
> >
> >- while ((dst_page = isolate_target_page(class))) {
> >- cc.d_page = dst_page;
> >- /*
> >- * If there is no more space in dst_page, resched
> >- * and see if anyone had allocated another zspage.
> >- */
> >- if (!migrate_zspage(pool, class, &cc))
> >+ /* Isolate target page and freeze all objects in the zspage */
> >+ if (!dst_page) {
> >+ dst_page = isolate_target_page(class);
> >+ if (!dst_page) {
> >+ spin_unlock(&class->lock);
> > break;
> >+ }
> >+ }
> >+ spin_unlock(&class->lock);
>
> (Sorry to delete individual recipients due to my compliance issues.)
>
> Hello, Minchan.
>
>
> Is it safe to unlock?
>
>
> (I assume that the system has 2 cores
> and a swap device is using zsmalloc pool.)
> If a zs compact context scheduled out after this "spin_unlock" line,
>
>
> CPU A (Swap In) CPU B (zs_free by process killed)
> --------------------- -------------------------
> ...
> spin_lock(&si->lock)
> ...
> # assume it is pinned by zs_compact context.
> pin_tag(handle) --> block
>
> ...
> spin_lock(&si->lock) --> block
>
>
> I think CPU A and CPU B may not be released forever.
> Am I missing something?

You didn't miss anything. It could be dead locked.
The swap_slot_free_notify is always really problem. :(
That's why I want to remove it.
I will think over how to handle it and send fix in next revision.

Thanks for the review!