[RFC PATCH] fpathconf() for fsync() behavior

From: Valerie Aurora Henson
Date: Wed Apr 22 2009 - 20:13:26 EST


In the default mode for ext3 and btrfs, fsync() is both slow and
unnecessary for some important application use cases - at the same
time that it is absolutely required for correctness for other modes of
ext3, ext4, XFS, etc. If applications could easily distinguish
between the two cases, they would be more likely to be correct and
fast.

How about an fpathconf() variable, something like _PC_ORDERED? E.g.:

/* Unoptimized example optional fsync() demo */
write(fd);
/* Only fsync() if we need it */
if (fpathconf(fd, _PC_ORDERED) != 1)
fsync(fd);
rename(tmp_path, new_path);

I know of two specific real-world cases in which this would
significantly improve performance: (a) fsync() before rename(), (b)
fsync() of the parent directory of a newly created file. Case (b) is
particularly nasty when you have multiple threads creating files in
the same directory because the dir's i_mutex is held across fsync() -
file creates become limited to the speed of sequential fsync()s.

Conceptual libc patch below.

-VAL

diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c
index db03529..5b64939 100644
--- a/sysdeps/unix/sysv/linux/pathconf.c
+++ b/sysdeps/unix/sysv/linux/pathconf.c
@@ -51,6 +51,9 @@ __pathconf (const char *file, int name)
case _PC_CHOWN_RESTRICTED:
return __statfs_chown_restricted (__statfs (file, &fsbuf), &fsbuf);

+ case _PC_ORDERED:
+ return __statfs_ordered (__statfs (file, &fsbuf), &fsbuf);
+
default:
return posix_pathconf (file, name);
}
@@ -225,3 +228,44 @@ __statfs_chown_restricted (int result, const struct statfs *fsbuf)

return retval;
}
+
+
+/* Tells us if write operations are ordered with respect to each
+ * other. Useful for skipping fsync in some cases. Default is 0 -
+ * not ordered. */
+
+/* Used like: return __statfs_ordered (__statfs (name, &buf), &buf); */
+long int
+__statfs_ordered (int result, const struct statfs *fsbuf)
+{
+ if (result < 0)
+ {
+ if (errno == ENOSYS)
+ /* Not possible, return the default value. */
+ return 0;
+
+ /* Some error occurred. */
+ return -1;
+ }
+
+#define BTRFS_SUPER_MAGIC 0x9123683E
+ switch (fsbuf->f_type)
+ {
+ case BTRFS_SUPER_MAGIC:
+ case EXT2_SUPER_MAGIC:
+ /* XXX Must distinguish between 2, 3, and 4 */
+ case REISERFS_SUPER_MAGIC:
+ /* XXX Nasty hacking needed here to determine exact
+ * journaling mode. Options include parsing /proc/mounts,
+ * defining an ioctl(), creating a generic VFS interface.
+ * For demonstration purposes, assume the default mode,
+ * which is ordered for each of these file systems.
+ */
+ return 1;
+ case XFS_SUPER_MAGIC:
+ /* XXX XFS has a trillion options, is there one to do ordered mode? */
+ return 0;
+ default:
+ return 0;
+ }
+}
diff --git a/bits/confname.h b/bits/confname.h
index 80b51ac..3d19902 100644
--- a/bits/confname.h
+++ b/bits/confname.h
@@ -39,6 +39,8 @@ enum
#define _PC_PIPE_BUF _PC_PIPE_BUF
_PC_CHOWN_RESTRICTED,
#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED
+ _PC_ORDERED,
+#define _PC_ORDERED _PC_ORDERED
_PC_NO_TRUNC,
#define _PC_NO_TRUNC _PC_NO_TRUNC
_PC_VDISABLE,
diff --git a/conform/data/unistd.h-data b/conform/data/unistd.h-data
index b6effa0..7325ff5 100644
--- a/conform/data/unistd.h-data
+++ b/conform/data/unistd.h-data
@@ -248,6 +248,7 @@ constant _PC_MAX_CANON
constant _PC_MAX_INPUT
constant _PC_NAME_MAX
constant _PC_NO_TRUNC
+constant _PC_ORDERED
constant _PC_PATH_MAX
constant _PC_PIPE_BUF
constant _PC_PRIO_IO
diff --git a/posix/annexc.c b/posix/annexc.c
index df5913a..658bdc1 100644
--- a/posix/annexc.c
+++ b/posix/annexc.c
@@ -501,7 +501,7 @@ static const char *const unistd_syms[] =
"F_OK", "NULL", "R_OK", "SEEK_CUR", "SEEK_END", "SEEK_SET", "STDERR_FILENO",
"STDIN_FILENO", "STDOUT_FILENO", "W_OK", "X_OK",
"_PC_ASYNC_IO", "_PC_CHOWN_RESTRICTED", "_PC_LINK_MAX", "_PC_MAX_CANON",
- "_PC_MAX_INPUT", "_PC_NAME_MAX", "_PC_NO_TRUNC", "_PC_PATH_MAX",
+ "_PC_MAX_INPUT", "_PC_NAME_MAX", "_PC_NO_TRUNC", "_PC_PATH_MAX", "_PC_ORDERED",
"_PC_PIPE_BUF", "_PC_PRIO_IO", "_PC_SYNC_IO", "_PC_VDISABLE",
"_SC_AIO_LISTIO_MAX", "_SC_AIO_MAX", "_SC_AIO_PRIO_DELTA_MAX",
"_SC_ARG_MAX", "_SC_ASYNCHRONOUS_IO", "_SC_CHILD_MAX", "_SC_CLK_TCK",
diff --git a/posix/fpathconf.c b/posix/fpathconf.c
index 840460b..d7f9a89 100644
--- a/posix/fpathconf.c
+++ b/posix/fpathconf.c
@@ -47,6 +47,7 @@ __fpathconf (fd, name)
case _PC_PIPE_BUF:
case _PC_SOCK_MAXBUF:
case _PC_CHOWN_RESTRICTED:
+ case _PC_ORDERED:
case _PC_NO_TRUNC:
case _PC_VDISABLE:
break;
diff --git a/posix/getconf.c b/posix/getconf.c
index 6184292..5995d60 100644
--- a/posix/getconf.c
+++ b/posix/getconf.c
@@ -81,6 +81,9 @@ static const struct conf vars[] =
#ifdef _PC_CHOWN_RESTRICTED
{ "_POSIX_CHOWN_RESTRICTED", _PC_CHOWN_RESTRICTED, PATHCONF },
#endif
+#ifdef _PC_ORDERED
+ { "_POSIX_ORDERED", _PC_ORDERED, PATHCONF },
+#endif
#ifdef _PC_NO_TRUNC
{ "_POSIX_NO_TRUNC", _PC_NO_TRUNC, PATHCONF },
#endif
diff --git a/sysdeps/posix/fpathconf.c b/sysdeps/posix/fpathconf.c
index 605cd17..c29fa6f 100644
--- a/sysdeps/posix/fpathconf.c
+++ b/sysdeps/posix/fpathconf.c
@@ -121,6 +121,13 @@ __fpathconf (fd, name)
return -1;
#endif

+ case _PC_ORDERED:
+#ifdef _POSIX_ORDERED
+ return _POSIX_ORDERED;
+#else
+ return -1;
+#endif
+
case _PC_NO_TRUNC:
#ifdef _POSIX_NO_TRUNC
return _POSIX_NO_TRUNC;
diff --git a/sysdeps/posix/pathconf.c b/sysdeps/posix/pathconf.c
index 75c99ee..f9d84ab 100644
--- a/sysdeps/posix/pathconf.c
+++ b/sysdeps/posix/pathconf.c
@@ -117,6 +117,13 @@ __pathconf (const char *path, int name)
return -1;
#endif

+ case _PC_ORDERED:
+#ifdef _POSIX_ORDERED
+ return _POSIX_ORDERED;
+#else
+ return -1;
+#endif
+
case _PC_NO_TRUNC:
#ifdef _POSIX_NO_TRUNC
return _POSIX_NO_TRUNC;
diff --git a/sysdeps/unix/sysv/linux/fpathconf.c b/sysdeps/unix/sysv/linux/fpathconf.c
index 2701c9e..51c43c4 100644
--- a/sysdeps/unix/sysv/linux/fpathconf.c
+++ b/sysdeps/unix/sysv/linux/fpathconf.c
@@ -48,6 +48,9 @@ __fpathconf (fd, name)
case _PC_CHOWN_RESTRICTED:
return __statfs_chown_restricted (__fstatfs (fd, &fsbuf), &fsbuf);

+ case _PC_ORDERED:
+ return __statfs_ordered (__fstatfs (fd, &fsbuf), &fsbuf);
+
default:
return posix_fpathconf (fd, name);
}
diff --git a/sysdeps/unix/sysv/linux/pathconf.h b/sysdeps/unix/sysv/linux/pathconf.h
index 806adcc..1c0b513 100644
--- a/sysdeps/unix/sysv/linux/pathconf.h
+++ b/sysdeps/unix/sysv/linux/pathconf.h
@@ -37,3 +37,6 @@ extern long int __statfs_symlinks (int result, const struct statfs *fsbuf);
/* Used like: return __statfs_chown_restricted (__statfs (name, &buf), &buf);*/
extern long int __statfs_chown_restricted (int result,
const struct statfs *fsbuf);
+
+/* Used like: return __statfs_ordered (__statfs (name, &buf), &buf); */
+extern long int __statfs_ordered (int result, const struct statfs *fsbuf);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/