Re: [mm] 4e2c82a409: ltp.overcommit_memory01.fail

From: Feng Tang
Date: Sun Jul 05 2020 - 00:51:15 EST


On Thu, Jul 02, 2020 at 03:12:30PM +0800, Feng Tang wrote:
> > <<<test_start>>>
> > tag=overcommit_memory01 stime=1593425044
> > cmdline="overcommit_memory"
> > contacts=""
> > analysis=exit
> > <<<test_output>>>
> > tst_test.c:1247: INFO: Timeout per run is 0h 05m 00s
> > overcommit_memory.c:116: INFO: MemTotal is 16394252 kB
> > overcommit_memory.c:118: INFO: SwapTotal is 268435452 kB
> > overcommit_memory.c:122: INFO: CommitLimit is 276632576 kB
> > mem.c:817: INFO: set overcommit_ratio to 50
> > mem.c:817: INFO: set overcommit_memory to 2
> > overcommit_memory.c:187: INFO: malloc 551061440 kB failed
> > overcommit_memory.c:208: PASS: alloc failed as expected
> > overcommit_memory.c:183: INFO: malloc 276632576 kB successfully
> > overcommit_memory.c:210: FAIL: alloc passed, expected to fail
>
> Thanks for the report!
>
> I took a rough look, and it all happens after changing the
> overcommit policy from a looser one to OVERCOMMIT_NEVER. I suspect
> it is due to the same cause as the previous warning message reported
> by Qian Cai https://lore.kernel.org/lkml/20200526181459.GD991@xxxxxx/
>
> Will further check it.

I did reproduce the problem, and from the debugging, this should
be the same root cause as https://lore.kernel.org/lkml/20200526181459.GD991@xxxxxx/
that loosing the batch cause some accuracy problem, and the solution of
adding some sync is still needed, which is dicussed in

First thing I tried a simple patch of using percpucounter_sum_read, and
the problem can't be reproduced:

--- a/mm/util.c
+++ b/mm/util.c
@@ -845,7 +845,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
allowed -= min_t(long, mm->total_vm / 32, reserve);
}

- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
+ if (percpu_counter_sum(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);


Then, I tried the sync patch we've discussed one month ago
https://lore.kernel.org/lkml/20200529154315.GI93879@xxxxxxxxxxxxxxxxxxxxxxx/
with it, I run the case 200 times and the problem was not reproduced,
can we consider taking this patch?

Thanks,
Feng

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 0a4f54d..01861ee 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -44,6 +44,7 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
+void percpu_counter_sync(struct percpu_counter *fbc);

static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
@@ -172,6 +173,9 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
return true;
}

+static inline void percpu_counter_sync(struct percpu_counter *fbc)
+{
+}
#endif /* CONFIG_SMP */

static inline void percpu_counter_inc(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index a66595b..d025137 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -98,6 +98,20 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
}
EXPORT_SYMBOL(percpu_counter_add_batch);

+void percpu_counter_sync(struct percpu_counter *fbc)
+{
+ unsigned long flags;
+ s64 count;
+
+ raw_spin_lock_irqsave(&fbc->lock, flags);
+ count = __this_cpu_read(*fbc->counters);
+ fbc->count += count;
+ __this_cpu_sub(*fbc->counters, count);
+ raw_spin_unlock_irqrestore(&fbc->lock, flags);
+}
+EXPORT_SYMBOL(percpu_counter_sync);
+
+
/*
* Add up all the per-cpu counts, return the result. This is a more accurate
* but much slower version of percpu_counter_read_positive()
diff --git a/mm/util.c b/mm/util.c
index 98813da..8b9664e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -746,14 +746,23 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
return ret;
}

+static void sync_overcommit_as(struct work_struct *dummy)
+{
+ percpu_counter_sync(&vm_committed_as);
+}
+
int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;

ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (ret == 0 && write)
+ if (ret == 0 && write) {
+ if (sysctl_overcommit_memory == OVERCOMMIT_NEVER)
+ schedule_on_each_cpu(sync_overcommit_as);
+
mm_compute_batch();
+ }

pr_info("ocommit=%lld, real=%lld policy[%d] ratio=%d\n\n\n",
percpu_counter_read_positive(&vm_committed_as),