Re: [PATCH RFC v2 4/4] memcg: implement memory thresholds

From: Kirill A. Shutemov
Date: Tue Dec 15 2009 - 05:46:40 EST


On Tue, Dec 15, 2009 at 3:58 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> On Sat, 12 Dec 2009 00:59:19 +0200
> "Kirill A. Shutemov" <kirill@xxxxxxxxxxxxx> wrote:
>
>> It allows to register multiple memory and memsw thresholds and gets
>> notifications when it crosses.
>>
>> To register a threshold application need:
>> - create an eventfd;
>> - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
>> - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
>>   cgroup.event_control.
>>
>> Application will be notified through eventfd when memory usage crosses
>> threshold in any direction.
>>
>> It's applicable for root and non-root cgroup.
>>
>> It uses stats to track memory usage, similar to soft limits. It checks
>> if we need to send event to userspace on every 100 page in/out. I guess
>> it's good compromise between performance and accuracy of thresholds.
>>
>> Signed-off-by: Kirill A. Shutemov <kirill@xxxxxxxxxxxxx>
>> ---
>>  mm/memcontrol.c |  263 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 files changed, 263 insertions(+), 0 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index c6081cc..5ba2140 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -6,6 +6,10 @@
>>   * Copyright 2007 OpenVZ SWsoft Inc
>>   * Author: Pavel Emelianov <xemul@xxxxxxxxxx>
>>   *
>> + * Memory thresholds
>> + * Copyright (C) 2009 Nokia Corporation
>> + * Author: Kirill A. Shutemov
>> + *
>>   * This program is free software; you can redistribute it and/or modify
>>   * it under the terms of the GNU General Public License as published by
>>   * the Free Software Foundation; either version 2 of the License, or
>> @@ -38,6 +42,7 @@
>>  #include <linux/vmalloc.h>
>>  #include <linux/mm_inline.h>
>>  #include <linux/page_cgroup.h>
>> +#include <linux/eventfd.h>
>>  #include "internal.h"
>>
>>  #include <asm/uaccess.h>
>> @@ -56,6 +61,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
>>
>>  static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
>>  #define SOFTLIMIT_EVENTS_THRESH (1000)
>> +#define THRESHOLDS_EVENTS_THRESH (100)
>>
>>  /*
>>   * Statistics for memory cgroup.
>> @@ -72,6 +78,8 @@ enum mem_cgroup_stat_index {
>>        MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
>>        MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
>>                                        used by soft limit implementation */
>> +     MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
>> +                                     used by threshold implementation */
>>
>>        MEM_CGROUP_STAT_NSTATS,
>>  };
>> @@ -182,6 +190,15 @@ struct mem_cgroup_tree {
>>
>>  static struct mem_cgroup_tree soft_limit_tree __read_mostly;
>>
>> +struct mem_cgroup_threshold {
>> +     struct list_head list;
>> +     struct eventfd_ctx *eventfd;
>> +     u64 threshold;
>> +};
>> +
>> +static bool mem_cgroup_threshold_check(struct mem_cgroup* mem);
>> +static void mem_cgroup_threshold(struct mem_cgroup* mem, bool swap);
>> +
>>  /*
>>   * The memory controller data structure. The memory controller controls both
>>   * page cache and RSS per cgroup. We would eventually like to provide
>> @@ -233,6 +250,19 @@ struct mem_cgroup {
>>        /* set when res.limit == memsw.limit */
>>        bool            memsw_is_minimum;
>>
>> +     /* protect lists of thresholds*/
>> +     spinlock_t thresholds_lock;
>> +
>> +     /* thresholds for memory usage */
>> +     struct list_head thresholds;
>> +     struct mem_cgroup_threshold *below_threshold;
>> +     struct mem_cgroup_threshold *above_threshold;
>> +
>> +     /* thresholds for mem+swap usage */
>> +     struct list_head memsw_thresholds;
>> +     struct mem_cgroup_threshold *memsw_below_threshold;
>> +     struct mem_cgroup_threshold *memsw_above_threshold;
>> +
>>        /*
>>         * statistics. This must be placed at the end of memcg.
>>         */
>> @@ -519,6 +549,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
>>                __mem_cgroup_stat_add_safe(cpustat,
>>                                MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
>>        __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
>> +     __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
>> +
>>        put_cpu();
>>  }
>>
>> @@ -1363,6 +1395,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
>>        if (mem_cgroup_soft_limit_check(mem))
>>                mem_cgroup_update_tree(mem, page);
>>  done:
>> +     if (mem_cgroup_threshold_check(mem)) {
>> +             mem_cgroup_threshold(mem, false);
>> +             if (do_swap_account)
>> +                     mem_cgroup_threshold(mem, true);
>> +     }
>>        return 0;
>>  nomem:
>>        css_put(&mem->css);
>> @@ -1906,6 +1943,11 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>>
>>        if (mem_cgroup_soft_limit_check(mem))
>>                mem_cgroup_update_tree(mem, page);
>> +     if (mem_cgroup_threshold_check(mem)) {
>> +             mem_cgroup_threshold(mem, false);
>> +             if (do_swap_account)
>> +                     mem_cgroup_threshold(mem, true);
>> +     }
>>        /* at swapout, this memcg will be accessed to record to swap */
>>        if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
>>                css_put(&mem->css);
>> @@ -2860,11 +2902,181 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
>>  }
>>
>>
>> +static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
>> +{
>> +     bool ret = false;
>> +     int cpu;
>> +     s64 val;
>> +     struct mem_cgroup_stat_cpu *cpustat;
>> +
>> +     cpu = get_cpu();
>> +     cpustat = &mem->stat.cpustat[cpu];
>> +     val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
>> +     if (unlikely(val < 0)) {
>> +             __mem_cgroup_stat_set(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
>> +                             THRESHOLDS_EVENTS_THRESH);
>> +             ret = true;
>> +     }
>> +     put_cpu();
>> +     return ret;
>> +}
>> +
>
> Hmm. please check
>
>        if (likely(list_empty(&mem->thresholds) &&
>                   list_empty(&mem->memsw_thresholds)))
>                return;

These lists are never empty. They always contain at least two fake thresholds:
one for 0 and one for RESOURCE_MAX.

>
> or adds a flag as mem->no_threshold_check to skip this routine quickly.
>
> _OR_
> I personally don't like to have 2 counters to catch events.
>
> How about this ?
>
>   adds
>   struct mem_cgroup {
>        atomic_t event_counter; // this is incremented per 32
>                                // page-in/out
>        atomic_t last_softlimit_check;
>        atomic_t last_thresh_check;
>   };
>
> static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
> {
>        decrement percpu event counter.
>        if (percpu counter reaches 0) {
>                if (atomic_dec_and_test(&mem->check_thresh)) {
>                        check threshold.
>                        reset counter.
>                }
>                if (atomic_dec_and_test(&mem->check_softlimit)) {
>                        update softlimit tree.
>                        reset counter.
>                }
>                reset percpu counter.
>        }
> }
>
> Then, you can have a counter like system-wide event counter.

I'll leave it as is for now, as you mentioned in the other letter.

>> +static void mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
>> +{
>> +     struct mem_cgroup_threshold **below, **above;
>> +     struct list_head *thresholds;
>> +     u64 usage = mem_cgroup_usage(memcg, swap);
>> +
>> +     if (!swap) {
>> +             thresholds = &memcg->thresholds;
>> +             above = &memcg->above_threshold;
>> +             below = &memcg->below_threshold;
>> +     } else {
>> +             thresholds = &memcg->memsw_thresholds;
>> +             above = &memcg->memsw_above_threshold;
>> +             below = &memcg->memsw_below_threshold;
>> +     }
>> +
>> +     spin_lock(&memcg->thresholds_lock);
>> +     if ((*above)->threshold <= usage) {
>> +             *below = *above;
>> +             list_for_each_entry_continue((*above), thresholds, list) {
>> +                     eventfd_signal((*below)->eventfd, 1);
>> +                     if ((*above)->threshold > usage)
>> +                             break;
>> +                     *below = *above;
>> +             }
>> +     } else if ((*below)->threshold > usage) {
>> +             *above = *below;
>> +             list_for_each_entry_continue_reverse((*below), thresholds,
>> +                             list) {
>> +                     eventfd_signal((*above)->eventfd, 1);
>> +                     if ((*below)->threshold <= usage)
>> +                             break;
>> +                     *above = *below;
>> +             }
>> +     }
>> +     spin_unlock(&memcg->thresholds_lock);
>> +}
>
> Could you adds comment on above check ?

I'll add comments in next version of patchset.

> And do we need *spin_lock* here ? Can't you use RCU list walk ?

I'll play with it.

> If you use have to use spinlock here, this is a system-wide spinlock,
> threshold as "100" is too small, I think.

What would be a reasonable value for THRESHOLDS_EVENTS_THRESH, in your opinion?

In most cases the spinlock is held only for two checks. Is that a significant amount of time?

Unfortunately, I can't test it on a big box. I only have a dual-core system,
which is not enough to test scalability.

> Even if you can't use spinlock, please use mutex. (with checking gfp_mask).
>
> Thanks,
> -Kame
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/