Re: mmap/munmap 20ms latencies, 2.6.37

From: Andrew Morton
Date: Fri Jan 14 2011 - 01:47:34 EST


(cc linux-mm)

On Tue, 11 Jan 2011 00:19:28 -0800 Andy Isaacson <adi@xxxxxxxxxxxxx> wrote:

> I'm seeing high latencies (up to 20 ms with no sign of CPU contention)
> from mmap/munmap, using the enclosed test program on various Core i7
> processors. For example on a "Intel(R) Core(TM) i7 CPU 930@xxxxxxx"
> with 12GB RAM running 0c21e3a,
>
> % /tmp/munmapslow -t 6
> ...
> [1294732507.539851] <8636> free(0x7f2608f08000) len=57344 took 4041 us
> [1294732507.559841] <8632> free(0x7f260e810000) len=32768 took 8032 us
> [1294732507.570254] <8632> malloc(81920) took 2452 us
> [1294732507.583834] <8635> malloc(73728) took 8035 us
> [1294732507.595840] <8631> malloc(65536) took 4048 us
> [1294732507.599838] <8635> malloc(45056) took 8041 us
> [1294732507.603834] <8631> malloc(20480) took 4029 us
> ...
> [1294732510.603845] <8634> realloc(0x756170, 23408) took 4035 us
> [1294732510.647850] <8634> malloc(73728) took 28047 us
> [1294732510.675831] <8631> malloc(53248) took 28028 us
> [1294732510.731835] <8631> malloc(81920) took 28035 us
> ...
> [1294732543.685480] <8633> 5244182 heap events, 3739 allocated, 4425 max, 153.57 MiB total
> [1294732543.685503] <8633> 78060.59 ms total for 5244182 allocations (15 us per allocation)
> [1294732543.685509] <8633> 13779.82 ms total for 5240443 frees (3 us per free)
>
> Most of the time, mmap takes about 15 us and munmap takes less than 5 us
> for these small buffers (<20 pages).
>
> Of course if a significant background job fires, a lot of "slow
> operations" are seen; but I'm seeing cases where preemption doesn't
> appear to be the cause. (I suppose I could parse /proc/self/sched to
> confirm this...)
>
> I'll try trace-cmd tomorrow, but I wonder if anyone already knows what's
> going on?
>
> (As you may guess, this is an attempt to isolate a problem seen in a
> larger system that's not easy to reduce to a simple testcase. We're
> seeing a 3x higher system time reported via munin than previously, even
> though the CPUs went from 20%-40% idle to 100% pegged and overall
> throughput *dropped* somewhat due to being CPU limited. It's not clear
> what changed to trigger the increased system time -- but munmap popped
> out of strace -c.)
>
> Thanks,
> -andy
>
> /*
> * Attempt to reproduce munmap slowness seen on
> * 2.6.35-23-generic
> * Intel(R) Xeon(R) CPU E5504 @ 2.00GHz
> * with HT turned on.
> *
> * strace -c shows that munmap is taking up to 11 ms per call and
> * that's where we are spending 95% of our time.
> *
> * Plan for testcase:
> *
> * fork N processes.
> * In each process, repeatedly allocate and free various buffer sizes.
> */
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <stdarg.h>
> #include <string.h>
> #include <errno.h>
>
> #include <sys/time.h>
>
> #include <unistd.h>
> #include <sys/wait.h>
> #include <sys/mman.h>
>
> #define MAX_TIME (1000 * 1e-6)
>
> typedef unsigned int u32;
>
> static int pgsz;
>
> double rtc(void)
> {
> struct timeval t;
>
> gettimeofday(&t, 0);
> return t.tv_sec + t.tv_usec/1e6;
> }
>
> void die(char *fmt, ...) __attribute__((noreturn));
>
> void die(char *fmt, ...)
> {
> va_list ap;
>
> fprintf(stderr, "[%.6f] <%d> ", rtc(), (int)getpid());
> va_start(ap, fmt);
> vfprintf(stderr, fmt, ap);
> va_end(ap);
> exit(1);
> }
>
> void msg(char *fmt, ...)
> {
> va_list ap;
>
> printf("[%.6f] <%d> ", rtc(), (int)getpid());
> va_start(ap, fmt);
> vfprintf(stdout, fmt, ap);
> va_end(ap);
> }
>
> /*
> * returns a random integer chosen from a uniform distribution on [0, range).
> */
> u32 myrand(u32 range)
> {
> u32 x, mask;
> int i, MAX = 1000;
>
> if (range == 1) return 0;
>
> mask = range - 1;
> while (mask & (mask - 1))
> mask &= (mask - 1);
> mask |= (mask - 1);
>
> for (i = 0; i < MAX; i++) {
> x = lrand48() & mask;
> if (x < range)
> return x;
> }
> die("myrand failed after %d iter, range = %lld mask = 0x%llx x = 0x%llx\n",
> i, (long long)range, (long long) mask, (long long)x);
> }
>
> static void **heap;
> static int num_heap, max_num_heap;
> static long long totsz;
>
> static double elapsed_alloc, elapsed_free;
> int num_alloc, num_free;
>
> void heap_stats(void)
> {
> msg("%d heap events, %d allocated, %d max, %.2f MiB total\n",
> num_alloc, num_heap, max_num_heap, totsz / 1024. / 1024);
> msg("%.2f ms total for %d allocations (%.0f us per allocation)\n",
> elapsed_alloc * 1e3, num_alloc, elapsed_alloc * 1e6 / num_alloc);
> msg("%.2f ms total for %d frees (%.0f us per free)\n",
> elapsed_free * 1e3, num_free, elapsed_free * 1e6 / num_alloc);
> }
>
> void *myalloc(size_t sz)
> {
> double t0, t1, e;
> void *p;
>
> t0 = rtc();
> p = mmap(0, sz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> if (p == MAP_FAILED)
> die("mmap(%d): %s\n", (int)sz, strerror(errno));
> t1 = rtc();
>
> e = t1 - t0;
>
> if (e > MAX_TIME)
> msg("malloc(%d) took %.0f us\n", (int)sz, e * 1e6);
>
> elapsed_alloc += e;
> num_alloc++;
>
> return p;
> }
>
> void myfree(void *p)
> {
> double t0, t1, e;
> int sz = *(int *)p;
>
> t0 = rtc();
> munmap(p, sz);
> t1 = rtc();
>
> e = t1 - t0;
> if (e > MAX_TIME)
> msg("free(%p) len=%d took %.0f us\n", p, sz, e * 1e6);
> elapsed_free += e;
> num_free++;
> }
>
> void heap_allocate(int sz)
> {
> void *n;
> void **h2;
> double t1, t2, e;
>
> n = myalloc(sz);
> if (!n) die("malloc(%d) failed.\n", sz);
>
> *(int *)n = sz;
>
> t1 = rtc();
> h2 = realloc(heap, sizeof(*heap) * ++num_heap);
> t2 = rtc();
> e = t2 - t1;
> if (!h2)
> die("realloc(%p, %d) failed.\n",
> heap, (int)(sizeof(*heap) * num_heap));
>
> if ((t2 - t1) > MAX_TIME)
> msg("realloc(%p, %d) took %.0f us\n",
> heap, (int)(sizeof(*heap) * num_heap), e * 1e6);
>
> if (num_heap > max_num_heap) max_num_heap = num_heap;
> totsz += sz;
>
> h2[num_heap-1] = n;
> heap = h2;
> }
>
> void heap_free(void)
> {
> int i;
>
> if (num_heap <= 0)
> return;
>
> i = myrand(num_heap--);
>
> totsz -= *(int *)heap[i];
>
> if (heap[i])
> myfree(heap[i]);
>
> for (; i < num_heap; i++)
> heap[i] = heap[i+1];
> }
>
> void send_message(void)
> {
> }
>
> void receive_message(void)
> {
> }
>
> void worker(void)
> {
> long long i;
> long long counts[4] = { 0 };
> int ITERCOUNT = 10 * 1024 * 1024;
>
> for (i=0; ; i++) {
> int action = myrand(4);
>
> if (i < 10000 && action == 1 && myrand(2) == 1) action = 0;
> counts[action]++;
> switch(action) {
> case 0:
> heap_allocate(pgsz * (myrand(20) + 1));
> break;
> case 1:
> heap_free();
> break;
> case 2:
> send_message();
> break;
> case 3:
> receive_message();
> break;
> }
> if (i % ITERCOUNT == ITERCOUNT - 1) {
> heap_stats();
> }
> }
> }
>
> void usage(const char *cmd)
> {
> die("Usage: %s [-t numthreads]\n", cmd);
> }
>
> int main(int argc, char **argv)
> {
> int c, i;
> int nproc = sysconf(_SC_NPROCESSORS_ONLN);
>
> pgsz = getpagesize();
>
> while ((c = getopt(argc, argv, "t:")) != EOF) {
> switch (c) {
> case 't':
> nproc = strtol(optarg, 0, 0);
> break;
> default:
> usage(argv[0]);
> }
> }
>
> printf("forking %d children\n", nproc);
> for (i=0; i < nproc; i++) {
> switch(fork()) {
> case -1:
> die("fork: %s\n", strerror(errno));
> case 0: /* child */
> worker();
> exit(0);
> default: /* parent */
> /* nothing */
> break;
> }
> }
>
> for (i=0; i < nproc; i++) {
> int x, p;
>
> p = wait(&x);
> msg("child %d exited\n", p);
> }
>
> return 0;
> }

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/