Re: [PATCH v3 2/5] zswap: make shrinking memcg-aware
From: Yosry Ahmed
Date: Thu Oct 19 2023 - 12:15:30 EST
[..]
> > >
> > > +/*********************************
> > > +* lru functions
> > > +**********************************/
> > > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> > > +{
> > > + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> >
> > Could we avoid the need for get/put with an rcu_read_lock() instead?
>
> I think we can, I'm not entirely sure of the consequences though. By the
> look of it I'd say it's safe but I wouldn't trust my judgement on this.
It just seems like we have a pattern of short-lived get/put. If RCU
gives enough protection it should be simpler. IIUC taking a reference
does not protect against offlining or reparenting, so I am not sure if
taking a reference here would provide any more protection than
>
> >
[..]
> > > @@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> > > zswap_entry_put(tree, entry);
> > > unlock:
> > > spin_unlock(&tree->lock);
> > > - return ret ? -EAGAIN : 0;
> > > + spin_lock(lock);
> > > + return ret;
> > > +}
> > > +
> > > +static int shrink_memcg(struct mem_cgroup *memcg)
> > > +{
> > > + struct zswap_pool *pool;
> > > + int nid, shrunk = 0;
> > > +
> > > + pool = zswap_pool_current_get();
> > > + if (!pool)
> > > + return -EINVAL;
> > > +
> > > + /*
> > > + * Skip zombies because their LRUs are reparented and we would be
> > > + * reclaiming from the parent instead of the dead memcgroup.
> >
> > nit: s/memcgroup/memcg.
> >
> > > + */
> > > + if (memcg && !mem_cgroup_online(memcg))
> > > + goto out;
> >
> > If we move this above zswap_pool_current_get(), we can return directly
> > and remove the label. I noticed we will return -EAGAIN if memcg is
> > offline. IIUC -EAGAIN for the caller will move on to the next memcg,
> > but I am wondering if a different errno would be clearer here.
>
> True, I remember spending some time staring at error codes but couldn't find a
> better one. What if we use -EINVAL for retryable errors, and use something else
> for the one where there is no pool? -ENODEV?
Do you mean -EINVAL for non-retryable errors? Perhaps -ENOENT is more
appropriate as a return for offline memcgs?
>
> >
[..]
> > > static void shrink_worker(struct work_struct *w)
> > > @@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
> > > shrink_work);
> > > int ret, failures = 0;
> > >
> > > + /* global reclaim will select cgroup in a round-robin fashion. */
> > > do {
> > > - ret = zswap_reclaim_entry(pool);
> > > + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> >
> > Perhaps next_shrink_memcg is a better name here?
>
> Will change if you have a strong preference, I'd keep it shorter because it's
> always used in conjunction with a memcg type or function.
I'd rather have the more explicit name unless it causes some annoying
line breaks or so.
>
> >
> > > +
> > > + ret = shrink_memcg(pool->next_shrink);
> > > +
> > > if (ret) {
> > > - zswap_reject_reclaim_fail++;
> > > if (ret != -EAGAIN)
> > > break;
> > > if (++failures == MAX_RECLAIM_RETRIES)
> > > @@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> > > */
> > > kref_init(&pool->kref);
> > > INIT_LIST_HEAD(&pool->list);
> > > - INIT_LIST_HEAD(&pool->lru);
> > > - spin_lock_init(&pool->lru_lock);
> > > + list_lru_init_memcg(&pool->list_lru, NULL);
> > > INIT_WORK(&pool->shrink_work, shrink_worker);
> > >
> > > zswap_pool_debug("created", pool);
> > > @@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> > >
> > > cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> > > free_percpu(pool->acomp_ctx);
> > > + list_lru_destroy(&pool->list_lru);
> > > + if (pool->next_shrink)
> > > + mem_cgroup_put(pool->next_shrink);
> > > for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > > zpool_destroy_pool(pool->zpools[i]);
> > > kfree(pool);
> > > @@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > >
> > > /* try to allocate swap cache page */
> > > page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
> > > - &page_was_allocated);
> > > + &page_was_allocated, true);
> > > if (!page) {
> > > ret = -ENOMEM;
> > > goto fail;
> > > @@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > > /* start writeback */
> > > __swap_writepage(page, &wbc);
> > > put_page(page);
> > > - zswap_written_back_pages++;
> > >
> > > return ret;
> > >
> > > @@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
> > > struct scatterlist input, output;
> > > struct crypto_acomp_ctx *acomp_ctx;
> > > struct obj_cgroup *objcg = NULL;
> > > + struct mem_cgroup *memcg = NULL;
> > > struct zswap_pool *pool;
> > > struct zpool *zpool;
> > > + int lru_alloc_ret;
> > > unsigned int dlen = PAGE_SIZE;
> > > unsigned long handle, value;
> > > char *buf;
> > > @@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
> > > zswap_invalidate_entry(tree, dupentry);
> > > }
> > > spin_unlock(&tree->lock);
> > > -
> > > - /*
> > > - * XXX: zswap reclaim does not work with cgroups yet. Without a
> > > - * cgroup-aware entry LRU, we will push out entries system-wide based on
> > > - * local cgroup limits.
> > > - */
> > > objcg = get_obj_cgroup_from_folio(folio);
> > > - if (objcg && !obj_cgroup_may_zswap(objcg))
> > > - goto reject;
> > > + if (objcg && !obj_cgroup_may_zswap(objcg)) {
> > > + memcg = get_mem_cgroup_from_objcg(objcg);
> > > + if (shrink_memcg(memcg)) {
> > > + mem_cgroup_put(memcg);
> > > + goto reject;
> > > + }
> > > + mem_cgroup_put(memcg);
> > > + }
> > >
> > > /* reclaim space if needed */
> > > if (zswap_is_full()) {
> > > @@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
> > > zswap_pool_reached_full = false;
> > > }
> > >
> > > + pool = zswap_pool_current_get();
> > > + if (!pool)
> > > + goto reject;
> > > +
> >
> > Why do we need to move zswap_pool_current_get() up here?
>
> Ah, thanks. This is a leftover from a previous version where the pool was needed
> to allocate the entry.
>
>
> >
> > > /* allocate entry */
> > > - entry = zswap_entry_cache_alloc(GFP_KERNEL);
> > > + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> > > if (!entry) {
> > > zswap_reject_kmemcache_fail++;
> > > + zswap_pool_put(pool);
> > > goto reject;
> > > }
> > >
> > > @@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
> > > entry->length = 0;
> > > entry->value = value;
> > > atomic_inc(&zswap_same_filled_pages);
> > > + zswap_pool_put(pool);
> > > goto insert_entry;
> > > }
> > > kunmap_atomic(src);
> > > @@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
> > > goto freepage;
> > >
> > > /* if entry is successfully added, it keeps the reference */
> > > - entry->pool = zswap_pool_current_get();
> > > - if (!entry->pool)
> > > - goto freepage;
> > > + entry->pool = pool;
> > > + if (objcg) {
> > > + memcg = get_mem_cgroup_from_objcg(objcg);
> > > + lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
> > > + mem_cgroup_put(memcg);
> > > +
> > > + if (lru_alloc_ret)
> > > + goto freepage;
> > > + }
> > >
> > > /* compress */
> > > acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > > @@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
> > > zswap_invalidate_entry(tree, dupentry);
> > > }
> > > if (entry->length) {
> > > - spin_lock(&entry->pool->lru_lock);
> > > - list_add(&entry->lru, &entry->pool->lru);
> > > - spin_unlock(&entry->pool->lru_lock);
> > > + INIT_LIST_HEAD(&entry->lru);
> > > + zswap_lru_add(&pool->list_lru, entry);
> > > }
> > > spin_unlock(&tree->lock);
> > >
> > > @@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
> > >
> > > put_dstmem:
> > > mutex_unlock(acomp_ctx->mutex);
> > > - zswap_pool_put(entry->pool);
> > > freepage:
> > > + zswap_pool_put(entry->pool);
> > > zswap_entry_cache_free(entry);
> > > reject:
> > > if (objcg)
> > > @@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
> > > zswap_invalidate_entry(tree, entry);
> > > folio_mark_dirty(folio);
> > > } else if (entry->length) {
> > > - spin_lock(&entry->pool->lru_lock);
> > > - list_move(&entry->lru, &entry->pool->lru);
> > > - spin_unlock(&entry->pool->lru_lock);
> > > + zswap_lru_del(&entry->pool->list_lru, entry);
> > > + zswap_lru_add(&entry->pool->list_lru, entry);
> > > }
> > > zswap_entry_put(tree, entry);
> > > spin_unlock(&tree->lock);
> > > --
> > > 2.34.1