Postgresql performance problems with IO latency, especially during fsync()

From: Andres Freund
Date: Wed Mar 26 2014 - 15:19:37 EST


Hi,

At LSF/MM there was a slot about postgres' problems with the kernel. Our
top#1 concern is frequent slow read()s that happen while another process
calls fsync(), even though we'd be perfectly fine if that fsync() took
ages.
The "conclusion" of that part was that it'd be very useful to have a
demonstration of the problem without needing a full blown postgres
setup. I've quickly hacked something together, that seems to show the
problem nicely.

For a bit of context: lwn.net/SubscriberLink/591723/940134eb57fcc0b8/
and the "IO Scheduling" bit in
http://archives.postgresql.org/message-id/20140310101537.GC10663%40suse.de

The tools output looks like this:
gcc -std=c99 -Wall -ggdb ~/tmp/ioperf.c -o ioperf && ./ioperf
...
wal[12155]: avg: 0.0 msec; max: 0.0 msec
commit[12155]: avg: 0.2 msec; max: 15.4 msec
wal[12155]: avg: 0.0 msec; max: 0.0 msec
read[12157]: avg: 0.2 msec; max: 9.4 msec
...
read[12165]: avg: 0.2 msec; max: 9.4 msec
wal[12155]: avg: 0.0 msec; max: 0.0 msec
starting fsync() of files
finished fsync() of files
read[12162]: avg: 0.6 msec; max: 2765.5 msec

So, the average read time is less than one ms (SSD, and about 50% cached
workload). But once another backend does the fsync(), read latency
skyrockets.

A concurrent iostat shows the problem pretty clearly:
Device: rrqm/s wrqm/s r/s w/s rMB/s wMB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 1.00 0.00 6322.00 337.00 51.73 4.38 17.26 2.09 0.32 0.19 2.59 0.14 90.00
sda 0.00 0.00 6016.00 303.00 47.18 3.95 16.57 2.30 0.36 0.23 3.12 0.15 94.40
sda 0.00 0.00 6236.00 1059.00 49.52 12.88 17.52 5.91 0.64 0.20 3.23 0.12 88.40
sda 0.00 0.00 105.00 26173.00 0.89 311.39 24.34 142.37 5.42 27.73 5.33 0.04 100.00
sda 0.00 0.00 78.00 27199.00 0.87 324.06 24.40 142.30 5.25 11.08 5.23 0.04 100.00
sda 0.00 0.00 10.00 33488.00 0.11 399.05 24.40 136.41 4.07 100.40 4.04 0.03 100.00
sda 0.00 0.00 3819.00 10096.00 31.14 120.47 22.31 42.80 3.10 0.32 4.15 0.07 96.00
sda 0.00 0.00 6482.00 346.00 52.98 4.53 17.25 1.93 0.28 0.20 1.80 0.14 93.20

While the fsync() is going on (or the kernel decides to start writing
out aggressively for some other reason) the amount of writes to the disk
is increased by two orders of magnitude. Unsurprisingly with disastrous
consequences for read() performance. We really want a way to pace the
writes issued to the disk more regularly.

The attached program right now can only be configured by changing some
details in the code itself, but I guess that's not a problem. It will
upfront allocate two files, and then start testing. If the files already
exists it will use them.

Possible solutions:
* Add a fadvise(UNDIRTY), that doesn't stall on a full IO queue like
sync_file_range() does.
* Make IO triggered by writeback regard IO priorities and add it to
schedulers other than CFQ
* Add a tunable that allows limiting the amount of dirty memory before
writeback on a per process basis.
* ...?

If somebody familiar with buffered IO writeback is around at LSF/MM, or
rather collab, Robert and I will be around for the next days.

Greetings,

Andres Freund
/*
* Portions Copyright (c) 2014, PostgreSQL Global Development Group
*
* Permission to use, copy, modify, and distribute this software and its
* documentation for any purpose, without fee, and without a written agreement
* is hereby granted, provided that the above copyright notice and this
* paragraph appear in all copies.
*
* Test program roughly simulating postgres' IO.
*
* Parameters will need need to be changed to reproduce the problem on
* individual systems.
*
* Author: Andres Freund, andres@xxxxxxxxxxxxxxx, andres@xxxxxxxxxxx
*/
#define _POSIX_C_SOURCE 200809L
#define _XOPEN_SOURCE 800

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <time.h>

/* CHANGE: number of reading processes */
#define NUM_RANDOM_READERS 16

/*
* CHANGE: set to memory size * 2 or so.
*
* Remove the 'data', 'wal' files after changing.
*/
static const size_t data_size = 1024L * 1024 * 1024 * 48;

/* probably ok this way */
static const size_t wal_size = 1024L * 1024 * 1024 * 1;

/* after how many iterations should stuff get reported */
static const uint64_t read_report_interval = 10000;
static const uint64_t wal_report_interval = 1000;
static const uint64_t commit_report_interval = 500;


/* internal data */
static const char initdata[8192];
static pid_t readers[NUM_RANDOM_READERS];

struct timing
{
uint64_t iter;
uint64_t period_total;
uint64_t total;
uint64_t period_max;

struct timespec t_before;
struct timespec t_after;
};

static void
fatal_error(int e)
{
fprintf(stderr, "frak me: %d: %s\n", e, strerror(e));
_exit(0);
}

static void
nsleep(int64_t s)
{
struct timespec d;
d.tv_sec = 0;
d.tv_nsec = s;
if (nanosleep(&d, NULL) < 0)
fatal_error(errno);
}

static off_t
random_block(size_t end)
{
return (((double) random())/RAND_MAX) * (end - 1);
}

static int64_t
nsec_diff(const struct timespec *a, const struct timespec *b)
{
return ((int64_t)(a->tv_sec - b->tv_sec) * 1000000000)
+ (a->tv_nsec - b->tv_nsec);
}

static void
timing_init(struct timing *t)
{
t->iter = 0;
t->total = 0;
t->period_total = 0;
t->period_max = 0;
}

static void
timing_before_action(struct timing *t)
{
clock_gettime(CLOCK_MONOTONIC, &t->t_before);
}


static void
timing_after_action(struct timing *t, const char *ctx, int64_t report_interval)
{
uint64_t dur;

clock_gettime(CLOCK_MONOTONIC, &t->t_after);

dur = nsec_diff(&t->t_after, &t->t_before);

t->iter++;
t->period_total += dur;
t->period_max = t->period_max < dur ? dur : t->period_max;

if ((t->iter % report_interval) == 0)
{
fprintf(stdout, "%s[%d]: avg: %.1f msec; max: %.1f msec\n",
ctx, getpid(),
(double) (t->period_total / read_report_interval) / 1000000,
(double) t->period_max / 1000000);
t->total += t->period_total;
t->period_total = 0;
t->period_max = 0;
}
}

static void
do_wal_writes(void)
{
int fd;
off_t pos = 0;
int64_t iter = 0;

struct timing wal_timing;
struct timing commit_timing;

timing_init(&wal_timing);
timing_init(&commit_timing);

fd = open("wal", O_RDWR, S_IRUSR|S_IWUSR);
if (fd < 0)
fatal_error(errno);

while(true)
{
bool is_commit = (iter++ % 5) == 0;

if (lseek(fd, pos, SEEK_SET) < 0)
fatal_error(errno);

timing_before_action(&wal_timing);
if (is_commit)
timing_before_action(&commit_timing);

if (write(fd, initdata, 8192) < 0)
fatal_error(errno);

timing_after_action(&wal_timing, "wal", wal_report_interval);

if (is_commit)
{

if (fdatasync(fd) < 0)
fatal_error(errno);
timing_after_action(&commit_timing, "commit", commit_report_interval);
}

pos += 8192;

if (pos + 8192 >= wal_size)
pos = 0;

nsleep(1000000);
}
}

static void
do_checkpointer_writes(void)
{
int fd;
int64_t writes = 0;

fd = open("data", O_RDWR, S_IRUSR|S_IWUSR);
if (fd < 0)
fatal_error(errno);

while(true)
{
off_t pos = random_block(data_size);

if (lseek(fd, pos, SEEK_SET) < 0)
fatal_error(errno);

if (write(fd, initdata, 8192) < 0)
fatal_error(errno);

if ((++writes % 100000) == 0)
{
fprintf(stdout, "starting fsync() of files\n");

if (fsync(fd) < 0)
fatal_error(errno);

fprintf(stdout, "finished fsync() of files\n");
}

nsleep(200000);
}
}

static void
do_random_reads(void)
{
int fd;
struct timing timing;

timing_init(&timing);

fd = open("data", O_RDWR, S_IRUSR|S_IWUSR);
if (fd < 0)
fatal_error(errno);

while(true)
{
char data[8192];
off_t pos = random_block(data_size);

if (lseek(fd, pos, SEEK_SET) < 0)
fatal_error(errno);

timing_before_action(&timing);

if (read(fd, data, 8192) < 0)
fatal_error(errno);

timing_after_action(&timing, "read", read_report_interval);
}
}

static void
initialize_files(void)
{
int fd;
ssize_t data_size_written = 0;
ssize_t wal_size_written = 0;

/* initialize data file */
fd = open("data", O_CREAT|O_EXCL|O_RDWR, S_IRUSR|S_IWUSR);
if (fd < 0 && errno == EEXIST)
;
else if (fd < 0)
fatal_error(errno);
else
{
while (data_size_written <= data_size)
{
ssize_t ret = write(fd, initdata, sizeof(initdata));
if (ret == -1)
fatal_error(errno);
data_size_written += ret;
}
if (fsync(fd) < 0)
fatal_error(errno);
close(fd);
}

/* initialize wal file */
fd = open("wal", O_CREAT|O_EXCL|O_RDWR, S_IRUSR|S_IWUSR);
if (fd < 0 && errno == EEXIST)
;
else if (fd < 0)
fatal_error(errno);
else
{
while (wal_size_written <= wal_size)
{
ssize_t ret = write(fd, initdata, sizeof(initdata));
if (ret == -1)
fatal_error(errno);
wal_size_written += ret;
}
fsync(fd);
close(fd);
}
}

static pid_t
start_subprocess(void (*sub)(void))
{
pid_t pid;

pid = fork();
if (pid == -1)
fatal_error(errno);
else if (pid == 0)
sub();

return pid;
}

int
main(int argc, char **argv)
{
int status;
pid_t checkpointer_pid, wal_pid;

/*
* Don't want to hit the same, already cached, pages after restarting.
*/
srandom((int)time(NULL));

initialize_files();

checkpointer_pid = start_subprocess(do_checkpointer_writes);
wal_pid = start_subprocess(do_wal_writes);

/* start all reader processes */
for (int i = 0; i < NUM_RANDOM_READERS; i++)
readers[i] = start_subprocess(do_random_reads);

/* return if all subprocesses decided to die */
for (int i = 0; i < NUM_RANDOM_READERS; i++)
waitpid(readers[i], &status, 0);

waitpid(checkpointer_pid, &status, 0);
waitpid(wal_pid, &status, 0);

return 0;
}