Re: [RFC][PATCH 3/4] Memory controller soft limit organize cgroups (v2)
From: KOSAKI Motohiro
Date: Mon Feb 16 2009 - 20:00:25 EST
> /*
> + * Cgroups above their limits are maintained in a RB-Tree, independent of
> + * their hierarchy representation
> + */
> +
> +static struct rb_root mem_cgroup_soft_limit_exceeded_groups;
A 37-character variable name seems too long.
> +static DEFINE_SPINLOCK(memcg_soft_limit_tree_lock);
> +
> +/*
> * The memory controller data structure. The memory controller controls both
> * page cache and RSS per cgroup. We would eventually like to provide
> * statistics based on the statistics developed by Rik Van Riel for clock-pro,
> @@ -176,12 +185,18 @@ struct mem_cgroup {
>
> unsigned int swappiness;
>
> + struct rb_node mem_cgroup_node;
> + unsigned long long usage_in_excess;
> + unsigned long last_tree_update;
> +
These fields have no comments.
What units do usage_in_excess and last_tree_update use? "unsigned long"
doesn't tell me anything.
> /*
> * statistics. This must be placed at the end of memcg.
> */
> struct mem_cgroup_stat stat;
> };
>
> +#define MEM_CGROUP_TREE_UPDATE_INTERVAL (HZ)
> +
In general, the memory subsystem should not depend on timer-based intervals.
That is because we expect machines to be 100x faster in 10 years, and
by then the proper timeout value will have changed.
Can we use proper statistics instead?
> enum charge_type {
> MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
> MEM_CGROUP_CHARGE_TYPE_MAPPED,
> @@ -214,6 +229,41 @@ static void mem_cgroup_get(struct mem_cgroup *mem);
> static void mem_cgroup_put(struct mem_cgroup *mem);
> static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
>
> +static void mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
> +{
> + struct rb_node **p = &mem_cgroup_soft_limit_exceeded_groups.rb_node;
> + struct rb_node *parent = NULL;
> + struct mem_cgroup *mem_node;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
> + while (*p) {
> + parent = *p;
> + mem_node = rb_entry(parent, struct mem_cgroup, mem_cgroup_node);
> + if (mem->usage_in_excess < mem_node->usage_in_excess)
> + p = &(*p)->rb_left;
> + /*
> + * We can't avoid mem cgroups that are over their soft
> + * limit by the same amount
> + */
> + else if (mem->usage_in_excess >= mem_node->usage_in_excess)
> + p = &(*p)->rb_right;
> + }
> + rb_link_node(&mem->mem_cgroup_node, parent, p);
> + rb_insert_color(&mem->mem_cgroup_node,
> + &mem_cgroup_soft_limit_exceeded_groups);
> + mem->last_tree_update = jiffies;
> + spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
> +}
I think this function is called from the page fault hot path, right?
If so, you are adding a global lock to the hot path!
> +
> +static void mem_cgroup_remove_exceeded(struct mem_cgroup *mem)
> +{
> + unsigned long flags;
> + spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
> + rb_erase(&mem->mem_cgroup_node, &mem_cgroup_soft_limit_exceeded_groups);
> + spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
> +}
> +
> static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
> struct page_cgroup *pc,
> bool charge)
> @@ -897,6 +947,39 @@ static void record_last_oom(struct mem_cgroup *mem)
> mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
> }
>
> +static void mem_cgroup_check_and_update_tree(struct mem_cgroup *mem,
> + bool time_check)
> +{
> + unsigned long long prev_usage_in_excess, new_usage_in_excess;
> + bool updated_tree = false;
> + unsigned long next_update = 0;
> + unsigned long flags;
> +
> + mem_cgroup_get(mem);
> + prev_usage_in_excess = mem->usage_in_excess;
> + new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
> +
> + if (time_check)
> + next_update = mem->last_tree_update +
> + MEM_CGROUP_TREE_UPDATE_INTERVAL;
> + if (new_usage_in_excess && time_after(jiffies, next_update)) {
Incorrect time_after() usage. jiffies can wrap around, so
time_after(jiffies, 0) is not guaranteed to return true.
> + if (prev_usage_in_excess)
> + mem_cgroup_remove_exceeded(mem);
> + mem_cgroup_insert_exceeded(mem);
> + updated_tree = true;
> + } else if (prev_usage_in_excess && !new_usage_in_excess) {
> + mem_cgroup_remove_exceeded(mem);
> + updated_tree = true;
> + }
> +
> + if (updated_tree) {
> + spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
> + mem->last_tree_update = jiffies;
> + mem->usage_in_excess = new_usage_in_excess;
> + spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
> + }
> + mem_cgroup_put(mem);
> +}
>
> /*
> * Unlike exported interface, "oom" parameter is added. if oom==true,
> @@ -906,9 +989,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> gfp_t gfp_mask, struct mem_cgroup **memcg,
> bool oom)
> {
> - struct mem_cgroup *mem, *mem_over_limit;
> + struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
> int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> - struct res_counter *fail_res;
> + struct res_counter *fail_res, *soft_fail_res = NULL;
>
> if (unlikely(test_thread_flag(TIF_MEMDIE))) {
> /* Don't account this! */
> @@ -938,12 +1021,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> int ret;
> bool noswap = false;
>
> - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
> + ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
> + &soft_fail_res);
> if (likely(!ret)) {
> if (!do_swap_account)
> break;
> ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
> - &fail_res);
> + &fail_res, NULL);
> if (likely(!ret))
> break;
> /* mem+swap counter fails */
> @@ -985,6 +1069,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> goto nomem;
> }
> }
> +
> + if (soft_fail_res) {
> + mem_over_soft_limit =
> + mem_cgroup_from_res_counter(soft_fail_res, res);
> + mem_cgroup_check_and_update_tree(mem_over_soft_limit, true);
> + }
> + mem_cgroup_check_and_update_tree(mem, true);
> return 0;
> nomem:
> css_put(&mem->css);
> @@ -1422,6 +1513,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> mz = page_cgroup_zoneinfo(pc);
> unlock_page_cgroup(pc);
>
> + mem_cgroup_check_and_update_tree(mem, true);
> /* at swapout, this memcg will be accessed to record to swap */
> if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> css_put(&mem->css);
> @@ -2346,6 +2438,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
> {
> int node;
>
> + mem_cgroup_check_and_update_tree(mem, false);
> free_css_id(&mem_cgroup_subsys, &mem->css);
>
> for_each_node_state(node, N_POSSIBLE)
> @@ -2412,6 +2505,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> if (cont->parent == NULL) {
> enable_swap_cgroup();
> parent = NULL;
> + mem_cgroup_soft_limit_exceeded_groups = RB_ROOT;
> } else {
> parent = mem_cgroup_from_cont(cont->parent);
> mem->use_hierarchy = parent->use_hierarchy;
> @@ -2432,6 +2526,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> res_counter_init(&mem->memsw, NULL);
> }
> mem->last_scanned_child = 0;
> + mem->usage_in_excess = 0;
> + mem->last_tree_update = 0; /* Yes, time begins at 0 here */
> spin_lock_init(&mem->reclaim_param_lock);
>
> if (parent)
>
> --
> Balbir
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/