[RFC PATCH] mm: madvise: Ignore repeated MADV_DONTNEED hints
From: Mel Gorman
Date: Mon Feb 02 2015 - 11:55:37 EST
glibc malloc changed behaviour in glibc 2.10 to have per-thread arenas
instead of creating new arenas if the existing ones were contended.
The decision appears to have been made so the allocator scales better but the
downside is that madvise(MADV_DONTNEED) is now called for these per-thread
arenas during free. This tears down pages that would have previously
remained. There is nothing wrong with this decision from a functional point
of view but any threaded application that frequently allocates/frees the
same-sized region is going to incur the full teardown and refault costs.
This is extremely obvious in the ebizzy benchmark. At its core, threads are
frequently freeing and allocating buffers of the same size. It is much faster
on distributions with older versions of glibc. Profiles showed that a large
amount of system CPU time was spent on tearing down and refaulting pages.
This patch identifies when a thread is frequently calling MADV_DONTNEED
on the same region of memory and starts ignoring the hint. On an 8-core
single-socket machine this was the impact on ebizzy using glibc 2.19.
ebizzy Overall Throughput
3.19.0-rc6 3.19.0-rc6
vanilla madvise-v1r1
Hmean Rsec-1 12619.93 ( 0.00%) 34807.02 (175.81%)
Hmean Rsec-3 33434.19 ( 0.00%) 100733.77 (201.29%)
Hmean Rsec-5 45796.68 ( 0.00%) 134257.34 (193.16%)
Hmean Rsec-7 53146.93 ( 0.00%) 145512.85 (173.79%)
Hmean Rsec-12 55132.87 ( 0.00%) 145560.86 (164.02%)
Hmean Rsec-18 54846.52 ( 0.00%) 145120.79 (164.59%)
Hmean Rsec-24 54368.95 ( 0.00%) 142733.89 (162.53%)
Hmean Rsec-30 54388.86 ( 0.00%) 141424.09 (160.02%)
Hmean Rsec-32 54047.11 ( 0.00%) 139151.76 (157.46%)
And the system CPU usage was also much reduced
3.19.0-rc6 3.19.0-rc6
vanilla madvise-v1r1
User 2647.19 8347.26
System 5742.90 42.42
Elapsed 1350.60 1350.65
It's even more ridiculous on a 4 socket machine
ebizzy Overall Throughput
3.19.0-rc6 3.19.0-rc6
vanilla madvise-v1r1
Hmean Rsec-1 5354.37 ( 0.00%) 12838.61 (139.78%)
Hmean Rsec-4 10338.41 ( 0.00%) 50514.52 (388.61%)
Hmean Rsec-7 7766.33 ( 0.00%) 88555.30 (1040.25%)
Hmean Rsec-12 7188.40 ( 0.00%) 154180.78 (2044.86%)
Hmean Rsec-21 7001.82 ( 0.00%) 266555.51 (3706.95%)
Hmean Rsec-30 8975.08 ( 0.00%) 314369.88 (3402.70%)
Hmean Rsec-48 12136.53 ( 0.00%) 358525.74 (2854.10%)
Hmean Rsec-79 12607.37 ( 0.00%) 341646.49 (2609.89%)
Hmean Rsec-110 12563.37 ( 0.00%) 338058.65 (2590.83%)
Hmean Rsec-141 11701.85 ( 0.00%) 331255.78 (2730.80%)
Hmean Rsec-172 10987.39 ( 0.00%) 312003.62 (2739.65%)
Hmean Rsec-192 12050.46 ( 0.00%) 296401.88 (2359.67%)
3.19.0-rc6 3.19.0-rc6
vanilla madvise-v1r1
User 4136.44 53506.65
System 50262.68 906.49
Elapsed 1802.07 1801.99
Note in both cases that the elapsed time is similar because the benchmark
is configured to run for a fixed duration.
MADV_FREE would have a lower cost if the underlying allocator used it but
there is no guarantee that allocators will use it. Arguably the kernel
has no business preventing an application developer shooting themselves
in the foot but this is a case where it's relatively easy to detect the bad
behaviour and avoid it.
Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
---
fs/exec.c | 4 ++++
include/linux/sched.h | 5 +++++
kernel/fork.c | 5 +++++
mm/madvise.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 70 insertions(+)
diff --git a/fs/exec.c b/fs/exec.c
index ad8798e26be9..5c691fcc32f4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1551,6 +1551,10 @@ static int do_execveat_common(int fd, struct filename *filename,
current->in_execve = 0;
acct_update_integrals(current);
task_numa_free(current);
+ if (current->madvise_state) {
+ kfree(current->madvise_state);
+ current->madvise_state = NULL;
+ }
free_bprm(bprm);
kfree(pathbuf);
putname(filename);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef98d2f..b6706bdb27fd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1271,6 +1271,9 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};
+/* mm/madvise.c */
+struct madvise_state_info;
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1637,6 +1640,8 @@ struct task_struct {
struct page_frag task_frag;
+ struct madvise_state_info *madvise_state;
+
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2ddade9f1..6d8dd1379240 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -246,6 +246,11 @@ void __put_task_struct(struct task_struct *tsk)
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
+ if (current->madvise_state) {
+ kfree(current->madvise_state);
+ current->madvise_state = NULL;
+ }
+
if (!profile_handoff_task(tsk))
free_task(tsk);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index a271adc93289..907bb0922711 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -19,6 +19,7 @@
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/vmacache.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -251,6 +252,57 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+#define MADVISE_HASH VMACACHE_HASH
+#define MADVISE_STATE_SIZE VMACACHE_SIZE
+#define MADVISE_THRESHOLD 8
+
+struct madvise_state_info {
+ unsigned long start;
+ unsigned long end;
+ int count;
+ unsigned long jiffies;
+};
+
+/* Returns true if userspace is continually dropping the same address range */
+static bool ignore_madvise_hint(unsigned long start, unsigned long end)
+{
+	int i;
+
+	if (!current->madvise_state)
+		current->madvise_state = kzalloc(sizeof(struct madvise_state_info) * MADVISE_STATE_SIZE, GFP_KERNEL);
+	if (!current->madvise_state)
+		return false;
+
+	i = MADVISE_HASH(start);
+	if (current->madvise_state[i].start != start ||
+	    current->madvise_state[i].end != end) {
+		/* cache miss: start tracking this range afresh */
+		current->madvise_state[i].start = start;
+		current->madvise_state[i].end = end;
+		current->madvise_state[i].count = 0;
+		current->madvise_state[i].jiffies = jiffies;
+	} else {
+		/* cache hit: same range madvised again by this task */
+		unsigned long reset = current->madvise_state[i].jiffies + HZ;
+		if (time_after(jiffies, reset)) {
+			/*
+			 * If it is a second since the last madvise on this
+			 * range or since madvise hints got ignored then reset
+			 * the counts and apply the hint again.
+			 */
+			current->madvise_state[i].count = 0;
+			current->madvise_state[i].jiffies = jiffies;
+		} else
+			current->madvise_state[i].count++;
+
+		if (current->madvise_state[i].count > MADVISE_THRESHOLD)
+			return true;
+		current->madvise_state[i].jiffies = jiffies;
+	}
+
+	return false;
+}
+
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
@@ -278,6 +330,10 @@ static long madvise_dontneed(struct vm_area_struct *vma,
if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
return -EINVAL;
+ /* Ignore hint if madvise is continually dropping the same range */
+ if (ignore_madvise_hint(start, end))
+ return 0;
+
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
struct zap_details details = {
.nonlinear_vma = vma,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/