BUG: mmapfile/writev spurious zero bytes (x86_64/not i386, bisected,reproducable)

From: Bron Gondwana
Date: Tue Jun 17 2008 - 02:16:59 EST


Background: we recently upgraded one of our 64bit kernel
machines to 2.6.25 and discovered that Cyrus skiplist
files were becoming randomly corrupted. This is a machine
with a 32bit Debian Etch userland but a 64bit kernel with
32bit support.

We run this way so we can actually use the 12Gb memory as
cache without running out of inode space, but don't have
to support two different sets of userland across our
servers.


The symptom - 16 to 24 bytes of zero appearing "randomly"
within the file after a "checkpoint" (file rewrite,
skipping stale copies of records).

Further investigation by retaining the pre-checkpoint files
showed that those bytes were the last ones of a page in the
original file.


Attached is a small C program which recreates the actions
that Cyrus takes, and uses record lengths identical to a
known-broken skiplist file on our systems.

Using this I was able to bisect the kernel to find the
commit which caused the problem:


08291429cfa6258c4cd95d8833beb40f828b194e is first bad commit
commit 08291429cfa6258c4cd95d8833beb40f828b194e
Author: Nick Piggin <npiggin@xxxxxxx>
Date: Tue Oct 16 01:24:59 2007 -0700

mm: fix pagecache write deadlocks

Modify the core write() code so that it won't take a pagefault while holding a
lock on the pagecache page. There are a number of different deadlocks possible
if we try to do such a thing:

[...]

Signed-off-by: Nick Piggin <npiggin@xxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>


For Cyrus users, this is a really serious bug - occasionally
the zeros will hit a "navigational component" of the file,
causing crashes and being noticeable. Most of the time
(including this example) it will just cause silent corruption
and data loss.

I suspect this will be visible to users as large swathes of
messages becoming unread, and if it hits the mailboxes.db,
large swathes of mailboxes just disappearing. Not good.


I apologise for the length of the attached C program. I tried
to make it shorter, but kept not tickling the bug. There's also
another advantage of keeping it like this - it closely mirrors the
Cyrus behaviour, to the point where the output is a valid skiplist
file.

It also has a "magic" mode, just pass a second parameter. It will
read through the mapped memory in order before the checkpoint.
This makes the bug disappear.

Let me know if there's anything else I can do to make this report
clearer. I've had a quick glance around the code, but especially
since it's 64 bit kernel only bug (I tested by rebooting the test
machine with a 2.6.25.3 32 bit kernel I had lying around and the
bug was not visible there. I've also repeated the test on my Ubuntu
desktop machine with the shipped kernel vs my hand-compiled
known-bad kernel, and tested with a 64bit userland in a chroot as
well).

Regards,

Bron.

--
Bron Gondwana
brong@xxxxxxxxxxx

/* This program is designed to test mmap file
* reads and iovec writes as per cyrus imap's
* skiplist database usage. It emulates the
* behaviour of a skiplist database (sans
* fcntl locking) across a file checkpoint */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <fcntl.h>

/* use some definitions out of the cyrus skiplist
* code to allow identical behaviour */

#define SKIPLIST_HEADER "\241\002\213\015skiplist file\0\0\0"

typedef unsigned int bit32;

#define WRITEV_ADD_TO_IOVEC(iov, num_iov, s, len) \
do { (iov)[(num_iov)].iov_base = (s); \
(iov)[(num_iov)++].iov_len = (len); } while (0)
/* bump to the next multiple of 4 bytes */
#define ROUNDUP(num) (((num) + 3) & 0xFFFFFFFC)

bit32 zero;
bit32 inorder;
bit32 add;
bit32 delete;
bit32 commit;
bit32 dummy;
bit32 max;

#define keylen 16
char *key;
#define vallen 6506
char *val;

void setup_vars() {
int i;

/* skiplist magic values */
zero = htonl(0);
inorder = htonl(1);
add = htonl(2);
delete = htonl(4);
commit = htonl(255);
dummy = htonl(257);
max = htonl((bit32)-1);

/* our key and value */
key = malloc(keylen * sizeof(char));
for (i = 0; i < keylen; i++) {
key[i] = 'k';
}

val = malloc(vallen * sizeof(char));
for (i = 0; i < vallen; i++) {
val[i] = 'b';
}
}

void write_dummy(int fd) {
char buffer[96];
int i;

for (i = 0; i < 96; i++)
buffer[i] = 0;

*(bit32 *)(buffer + 0) = dummy; /* type */
*(bit32 *)(buffer + 92) = max; /* end ptr list */

lseek(fd, 48, 0);
write(fd, buffer, 96);
}

void write_header(int fd) {
char buffer[48];
int i;

for (i = 0; i < 48; i++)
buffer[i] = 0;

strncpy(buffer, SKIPLIST_HEADER, 20);
*(bit32 *)(buffer + 20) = htonl((bit32) 1); /* version */
*(bit32 *)(buffer + 24) = htonl((bit32) 2); /* minorversion */
*(bit32 *)(buffer + 28) = htonl((bit32) 20); /* maxlevel */
*(bit32 *)(buffer + 32) = htonl((bit32) 1); /* curlevel */
*(bit32 *)(buffer + 36) = htonl((bit32) 0); /* numrecords */
*(bit32 *)(buffer + 40) = htonl((bit32)144); /* length at last checkpoint */
*(bit32 *)(buffer + 44) = htonl((bit32) 0); /* last recovery timestamp */

lseek(fd, 0, 0);
write(fd, buffer, 48);
}

int main(int argc, char **argv)
{
/* files */
char filename[1024];
int fd;
char newfilename[1024];
int newfd;

/* io vector */
struct iovec iov[50];
int num_iov;

/* mmap stuff */
char *mapbase;
int mapsize;
int maplen;

/* network order values */
bit32 netkeylen;
bit32 netvallen;
bit32 netoffset;

/* misc ints */
int i;
int offset;
int newoffset;
int size;
int magic;

setup_vars();

if (argc <= 1) {
printf("NEED A FILE\n");
exit(1);
}

strncpy(filename, argv[1], sizeof(filename));
unlink(filename);
fd = open(filename, O_RDWR | O_CREAT, 0666);

/* STAGE 1 - create a "pre-checkpointed" file that looks just like
* a real seen file before it gets updated */

write_header(fd);
write_dummy(fd);
fdatasync(fd);

/* mmap the file now */

maplen = 1024 * 16;
mapbase = (char *)mmap((caddr_t)0, maplen, PROT_READ, MAP_SHARED, fd, 0L);

/* create the initial checkpointed record */

num_iov = 0;
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&inorder, 4); /* header */

/* key */
netkeylen = htonl(keylen);
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netkeylen, 4); /* length */
WRITEV_ADD_TO_IOVEC(iov, num_iov, key, keylen); /* data */
if (ROUNDUP(keylen) > keylen) { /* need to pad */
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, ROUNDUP(keylen) - keylen);
}

/* value */
netvallen = htonl(vallen);
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netvallen, 4); /* length */
WRITEV_ADD_TO_IOVEC(iov, num_iov, val, vallen); /* data */
if (ROUNDUP(vallen) > vallen) { /* need to pad */
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, ROUNDUP(vallen) - vallen);
}

/* only one pointer, and then finish the record off */

WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, 4); /* pointer */
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&max, 4); /* end record */

/* write the record */
lseek(fd, 0, 2);
writev(fd, iov, num_iov);

/* update the dummy pointer */
lseek(fd, 60, 0);
netoffset = htonl(144);
write(fd, (char *)&netoffset, 4);

/* update the header */
netoffset = htonl(0x1a20);
lseek(fd, 40, 0);
write(fd, (char *)&netoffset, 4);

/* sync the record to disk */
fdatasync(fd);


/* CHECKPOINT 1 - the file on disk is now 100% identical to
* the file that would exist after loading the data above
* into a cyrus skiplist file and running the checkpoint code
* (ok, except the timestamp)
*/

/* Cyrus would check the MMAP at this point and discover it
* still fits all the data within the "slot", so it doesn't
* actually remap anything */

/* STAGE 2 - write a new record - starting by finding the
* pointers and deleting the old record. Simplified for
* a level 1 node here. This appends to the file.
*/

/* find the pointer record - Cyrus does this, so do we */
for (i = 20; i > 0; i--) {
offset = ntohl(*(bit32 *)(mapbase + 60 + 4*(i-1)));
if (offset) break;
}

/* test that the keys match */
if (strncmp(mapbase + offset + 4 + 4, key, keylen)) {
printf("ODD, key wasn't found\n");
}

num_iov = 0;

/* add the delete record */
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&delete, 4); /* header */
netoffset = htonl(offset);
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netoffset, 4); /* offset to fix */

/* move offset along - this is "NEXT" of deleted record, will
* always be zero because there's only one record */
if (offset) {
int klen = ROUNDUP(ntohl(*(bit32 *)(mapbase + offset + 4)));
int vlen = ROUNDUP(ntohl(*(bit32 *)(mapbase + offset + 4 + 4 + klen)));
offset = ntohl(*(bit32 *)(mapbase + offset + 4 + 4 + klen + 4 + vlen));
}
if (offset != 0) {
printf("ODD, offset isn't 0 after deleting record\n");
}

/* mark the deletion */
lseek(fd, 60, 0);
write(fd, (char *)&zero, 4);

/* set up the new record */
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&add, 4); /* header */

/* key */
netkeylen = htonl(keylen);
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netkeylen, 4); /* length */
WRITEV_ADD_TO_IOVEC(iov, num_iov, key, keylen); /* data */
if (ROUNDUP(keylen) > keylen) { /* need to pad */
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, ROUNDUP(keylen) - keylen);
}

/* value */
netvallen = htonl(vallen);
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netvallen, 4); /* length */
WRITEV_ADD_TO_IOVEC(iov, num_iov, val, vallen); /* data */
if (ROUNDUP(vallen) > vallen) { /* need to pad */
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, ROUNDUP(vallen) - vallen);
}

/* only one pointer, and then finish the record off */

WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, 4); /* pointer */
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&max, 4); /* end record */

/* write the record */
offset = lseek(fd, 0, 2);
writev(fd, iov, num_iov);

/* update the dummy pointer */
lseek(fd, 60, 0);
netoffset = htonl(offset + 8); /* skip the delete */
write(fd, (char *)&netoffset, 4);
fdatasync(fd);

/* commit record goes on the end of the file only once all
* data changes are fsynced to disk */
lseek(fd, 0, 2);
write(fd, (char *)&commit, 4);
fdatasync(fd);

/* CHECKPOINT 2 - we've finished writing and committing the new record,
* and the Cyrus skiplist would checkpoint this file now.
* We simulate that here */

/* again - we fit insize the mmap space, so it doesn't get refreshed */

/* STAGE 3 - perform a cyrus checkpoint */

if (argc > 2) {
/* PERFORM MAGIC - access the mmap in order */
for (i = 0; i < maplen; i++) {
magic ^= mapbase[i];
}
}

/* stream to a new file */

snprintf(newfilename, sizeof(newfilename), "%s.NEW", filename);
newfd = open(newfilename, O_RDWR | O_CREAT, 0666);
ftruncate(newfd, 0);

/* dummy goes first, header gets written last */
write_dummy(newfd);

lseek(newfd, 0, 2); /* 144 */

/* again, Cyrus finds the record first */
for (i = 20; i > 0; i--) {
offset = ntohl(*(bit32 *)(mapbase + 60 + 4*(i-1)));
if (offset) break;
}

/* and calculates the length by reading the lengths of the
* various components and summing them up, finally walking
* the pointers until it hits a "-1"
*/
if (offset) {
int klen = ROUNDUP(ntohl(*(bit32 *)(mapbase + offset + 4)));
int vlen = ROUNDUP(ntohl(*(bit32 *)(mapbase + offset + 4 + 4 + klen)));
size = 4 + 4 + klen + 4 + vlen;
while (ntohl(*(bit32 *)(mapbase + offset + size)) != -1) {
size += 4;
}
size += 4; /* final -1 pointer */
}
else {
/* didn't find the record */
printf("Failed to find the record, dying\n");
exit(1);
}

num_iov = 0;
/* write the entire record, but change the TYPE field from
* whatever it was (ADD or INORDER) to INORDER only */
WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&inorder, 4); /* header */
WRITEV_ADD_TO_IOVEC(iov, num_iov, mapbase + offset + 4, size - 4); /* data */
writev(newfd, iov, num_iov);

/* write the forward pointer */
lseek(newfd, 60, 0);
netoffset = htonl(144);
write(newfd, (char *)&netoffset, 4);

/* write the zeros - first the one in the new record */
lseek(newfd, mapbase + offset + size - 8, 0);
write(newfd, (char *)&zero, 4);

/* then the rest into the dummy */
for (i = 1; i < 20; i++) {
lseek(newfd, 60 + 4*i, 0);
write(newfd, (char *)&zero, 4);
}

/* write the header last */
write_header(fd);

close(newfd);
close(fd);

exit(0);
}