[WIP] tux3: Optimized fsync

From: Daniel Phillips
Date: Thu May 14 2015 - 03:37:19 EST


Greetings,

This diff against head (f59558a04c5ad052dc03ceeda62ccf31f4ab0004) of

https://github.com/OGAWAHirofumi/linux-tux3/tree/hirofumi-user

provides the optimized fsync code that was used to generate the
benchmark results here:

https://lkml.org/lkml/2015/4/28/838
"How fast can we fsync?"

This patch also applies to:

https://github.com/OGAWAHirofumi/linux-tux3/tree/hirofumi

which is a 3.19 kernel cloned from mainline. (Preferred)

Build instructions are on the wiki:

https://github.com/OGAWAHirofumi/linux-tux3/wiki

There is some slight skew in the instructions because this is
not on master yet.

****************************************************************
***** Caveat: No out of space handling on this branch! *******
*** If you run out of space you will get a mysterious assert ***
****************************************************************

Enjoy!

Daniel

diff --git a/fs/tux3/buffer.c b/fs/tux3/buffer.c
index ef0d917..a141687 100644
--- a/fs/tux3/buffer.c
+++ b/fs/tux3/buffer.c
@@ -29,7 +29,7 @@ TUX3_DEFINE_STATE_FNS(unsigned long, buf, BUFDELTA_AVAIL, BUFDELTA_BITS,
* may not work on all arch (If set_bit() and cmpxchg() is not
* exclusive, this has race).
*/
-static void tux3_set_bufdelta(struct buffer_head *buffer, int delta)
+void tux3_set_bufdelta(struct buffer_head *buffer, int delta)
{
unsigned long state, old_state;

diff --git a/fs/tux3/commit.c b/fs/tux3/commit.c
index 909a222..955c441a 100644
--- a/fs/tux3/commit.c
+++ b/fs/tux3/commit.c
@@ -289,12 +289,13 @@ static int commit_delta(struct sb *sb)
req_flag |= REQ_NOIDLE | REQ_FLUSH | REQ_FUA;
}

- trace("commit %i logblocks", be32_to_cpu(sb->super.logcount));
+ trace("commit %i logblocks", logcount(sb));
err = save_metablock(sb, req_flag);
if (err)
return err;

- tux3_wake_delta_commit(sb);
+ if (!fsync_mode(sb))
+ tux3_wake_delta_commit(sb);

/* Commit was finished, apply defered bfree. */
return unstash(sb, &sb->defree, apply_defered_bfree);
@@ -314,8 +315,7 @@ static void post_commit(struct sb *sb, unsigned delta)

static int need_unify(struct sb *sb)
{
- static unsigned crudehack;
- return !(++crudehack % 3);
+ return logcount(sb) > 300; /* FIXME: should be based on bandwidth and tunable */
}

/* For debugging */
@@ -359,7 +359,7 @@ static int do_commit(struct sb *sb, int flags)
* FIXME: there is no need to commit if normal inodes are not
* dirty? better way?
*/
- if (!(flags & __FORCE_DELTA) && !tux3_has_dirty_inodes(sb, delta))
+ if (0 && !(flags & __FORCE_DELTA) && !tux3_has_dirty_inodes(sb, delta))
goto out;

/* Prepare to wait I/O */
@@ -402,6 +402,7 @@ static int do_commit(struct sb *sb, int flags)
#endif

if ((!no_unify && need_unify(sb)) || (flags & __FORCE_UNIFY)) {
+ trace("unify %u, delta %u", sb->unify, delta);
err = unify_log(sb);
if (err)
goto error; /* FIXME: error handling */
diff --git a/fs/tux3/commit_flusher.c b/fs/tux3/commit_flusher.c
index 59d6781..31cd51e 100644
--- a/fs/tux3/commit_flusher.c
+++ b/fs/tux3/commit_flusher.c
@@ -198,6 +198,8 @@ long tux3_writeback(struct super_block *super, struct bdi_writeback *wb,
if (work->reason == WB_REASON_SYNC)
goto out;

+ trace("tux3_writeback, reason = %i", work->reason);
+
if (work->reason == WB_REASON_TUX3_PENDING) {
struct tux3_wb_work *wb_work;
/* Specified target delta for staging. */
@@ -343,3 +345,7 @@ static void schedule_flush_delta(struct sb *sb, struct delta_ref *delta_ref)
sb->delta_pending++;
wake_up_all(&sb->delta_transition_wq);
}
+
+#ifdef __KERNEL__
+#include "commit_fsync.c"
+#endif
diff --git a/fs/tux3/commit_fsync.c b/fs/tux3/commit_fsync.c
new file mode 100644
index 0000000..9a59c59
--- /dev/null
+++ b/fs/tux3/commit_fsync.c
@@ -0,0 +1,341 @@
+/*
+ * Optimized fsync.
+ *
+ * Copyright (c) 2015 Daniel Phillips
+ */
+
+#include <linux/delay.h>
+
+static inline int fsync_pending(struct sb *sb)
+{
+ return atomic_read(&sb->fsync_pending);
+}
+
+static inline int delta_needed(struct sb *sb)
+{
+ return waitqueue_active(&sb->delta_transition_wq);
+}
+
+static inline int fsync_drain(struct sb *sb)
+{
+ return test_bit(TUX3_FSYNC_DRAIN_BIT, &sb->backend_state);
+}
+
+static inline unsigned fsync_group(struct sb *sb)
+{
+ return atomic_read(&sb->fsync_group);
+}
+
+static int suspend_transition(struct sb *sb)
+{
+ while (sb->suspended == NULL) {
+ if (!test_and_set_bit(TUX3_STATE_TRANSITION_BIT, &sb->backend_state)) {
+ sb->suspended = delta_get(sb);
+ return 1;
+ }
+ cpu_relax();
+ }
+ return 0;
+}
+
+static void resume_transition(struct sb *sb)
+{
+ delta_put(sb, sb->suspended);
+ sb->suspended = NULL;
+
+ if (need_unify(sb))
+ delta_transition(sb);
+
+ /* Make sure !suspended is visible before transition clear */
+ smp_mb__before_atomic();
+ clear_bit(TUX3_STATE_TRANSITION_BIT, &sb->backend_state);
+ /* Make sure transition clear is visible before drain clear */
+ smp_mb__before_atomic();
+ clear_bit(TUX3_FSYNC_DRAIN_BIT, &sb->backend_state);
+ wake_up_all(&sb->delta_transition_wq);
+}
+
+static void tux3_wait_for_free(struct sb *sb, unsigned delta)
+{
+ unsigned free_delta = delta + TUX3_MAX_DELTA;
+ /* FIXME: better to be killable */
+ wait_event(sb->delta_transition_wq,
+ delta_after_eq(sb->delta_free, free_delta));
+}
+
+/*
+ * Write log and commit. (Mostly borrowed from do_commit)
+ *
+ * This needs specfic handling for the commit block, so
+ * maybe add an fsync flag to commit_delta.
+ */
+static int commit_fsync(struct sb *sb, unsigned delta, struct blk_plug *plug)
+{
+ write_btree(sb, delta);
+ write_log(sb);
+ blk_finish_plug(plug);
+ commit_delta(sb);
+ post_commit(sb, delta);
+ return 0;
+}
+
+enum { groups_per_commit = 4 };
+
+/*
+ * Backend fsync commit task, serialized with delta backend.
+ */
+void fsync_backend(struct work_struct *work)
+{
+ struct sb *sb = container_of(work, struct fsync_work, work)->sb;
+ struct syncgroup *back = &sb->fsync[(fsync_group(sb) - 1) % fsync_wrap];
+ struct syncgroup *front = &sb->fsync[fsync_group(sb) % fsync_wrap];
+ struct syncgroup *idle = &sb->fsync[(fsync_group(sb) + 1) % fsync_wrap];
+ unsigned back_delta = sb->suspended->delta - 1;
+ unsigned start = fsync_group(sb), groups = 0;
+ struct blk_plug plug;
+ int err; /* How to report?? */
+
+ trace("enter fsync backend, delta = %i", sb->suspended->delta);
+ tux3_start_backend(sb);
+ sb->flags |= SB_FSYNC_FLUSH_FLAG;
+
+ while (1) {
+ sb->ioinfo = NULL;
+ assert(list_empty(&tux3_sb_ddc(sb, back_delta)->dirty_inodes));
+ while (atomic_read(&front->busy)) {
+ struct ioinfo ioinfo;
+ unsigned i;
+ /*
+ * Verify that the tail of the group queue is idle in
+ * the sense that all waiting fsyncs woke up and released
+ * their busy counts. This busy wait is only theoretical
+ * because fsync tasks have plenty of time to wake up
+ * while the the next group commits to media, but handle
+ * it anyway for completeness.
+ */
+ for (i = 0; atomic_read(&idle->busy); i++)
+ usleep_range(10, 1000);
+ if (i)
+ tux3_warn(sb, "*** %u spins on queue full ***", i);
+ reinit_completion(&idle->wait);
+
+ /*
+ * Bump the fsync group counter so fsync backend owns the
+ * next group of fsync inodes and can walk stable lists
+ * while new fsyncs go onto the new frontend lists.
+ */
+ spin_lock(&sb->fsync_lock);
+ atomic_inc(&sb->fsync_group);
+ spin_unlock(&sb->fsync_lock);
+
+ back = front;
+ front = idle;
+ idle = &sb->fsync[(fsync_group(sb) + 1) % fsync_wrap];
+
+ trace("fsync flush group %tu, queued = %i, busy = %i",
+ back - sb->fsync, atomic_read(&sb->fsync_pending),
+ atomic_read(&back->busy));
+
+ if (!sb->ioinfo) {
+ tux3_io_init(&ioinfo, REQ_SYNC);
+ sb->ioinfo = &ioinfo;
+ blk_start_plug(&plug);
+ }
+
+ /*
+ * NOTE: this may flush same inode multiple times, and those
+ * blocks are submitted under plugging. So, by reordering,
+ * later requests by tux3_flush_inodes() can be flushed
+ * before former submitted requests. We do page forking, and
+ * don't free until commit, so reorder should not be problem.
+ * But we should remember this surprise.
+ */
+ err = tux3_flush_inodes_list(sb, back_delta, &back->list);
+ if (err) {
+ tux3_warn(sb, "tux3_flush_inodes_list error %i!", -err);
+ goto ouch;
+ }
+ list_splice_init(&back->list, &tux3_sb_ddc(sb, back_delta)->dirty_inodes);
+ atomic_sub(atomic_read(&back->busy), &sb->fsync_pending);
+
+ if (++groups < groups_per_commit && atomic_read(&front->busy)) {
+ trace("fsync merge group %u", fsync_group(sb));
+ continue;
+ }
+
+ commit_fsync(sb, back_delta, &plug);
+ sb->ioinfo = NULL;
+ wake_up_all(&sb->fsync_collide);
+
+ /*
+ * Wake up commit waiters for all groups in this commit.
+ */
+ trace("complete %i groups, %i to %i", groups, start, start + groups -1);
+ for (i = 0; i < groups; i++) {
+ struct syncgroup *done = &sb->fsync[(start + i) % fsync_wrap];
+ complete_all(&done->wait);
+ }
+
+ if (!fsync_pending(sb) || delta_needed(sb) || need_unify(sb))
+ set_bit(TUX3_FSYNC_DRAIN_BIT, &sb->backend_state);
+
+ start = fsync_group(sb);
+ groups = 0;
+ }
+
+ if (fsync_drain(sb) && !fsync_pending(sb))
+ break;
+
+ usleep_range(10, 500);
+ }
+
+ouch:
+ tux3_end_backend();
+ sb->flags &= ~SB_FSYNC_FLUSH_FLAG;
+ resume_transition(sb);
+ trace("leave fsync backend, group = %i", fsync_group(sb));
+ return; /* FIXME: error? */
+}
+
+int tux3_sync_inode(struct sb *sb, struct inode *inode)
+{
+ void tux3_set_bufdelta(struct buffer_head *buffer, int delta);
+ struct tux3_inode *tuxnode = tux_inode(inode);
+ struct inode_delta_dirty *front_dirty, *back_dirty;
+ struct buffer_head *buffer;
+ struct syncgroup *front;
+ unsigned front_delta;
+ int err = 0, start_backend = 0;
+
+ trace("fsync inode %Lu", (long long)tuxnode->inum);
+
+ /*
+ * Prevent new fsyncs from queuing if fsync_backend wants to exit.
+ */
+ if (fsync_drain(sb))
+ wait_event(sb->delta_transition_wq, !fsync_drain(sb));
+
+ /*
+ * Prevent fsync_backend from exiting and delta from changing until
+ * this fsync is queued and flushed.
+ */
+ atomic_inc(&sb->fsync_pending);
+ start_backend = suspend_transition(sb);
+ front_delta = sb->suspended->delta;
+ front_dirty = tux3_inode_ddc(inode, front_delta);
+ back_dirty = tux3_inode_ddc(inode, front_delta - 1);
+ tux3_wait_for_free(sb, front_delta - 1);
+
+ /*
+ * If another fsync is in progress on this inode then wait to
+ * avoid block collisions.
+ */
+ if (tux3_inode_test_and_set_flag(TUX3_INODE_FSYNC_BIT, inode)) {
+ trace("parallel fsync of inode %Lu", (long long)tuxnode->inum);
+ if (start_backend) {
+ queue_work(sb->fsync_workqueue, &sb->fsync_work.work);
+ start_backend = 0;
+ }
+ err = wait_event_killable(sb->fsync_collide,
+ !tux3_inode_test_and_set_flag(TUX3_INODE_FSYNC_BIT, inode));
+ if (err) {
+ tux3_inode_clear_flag(TUX3_INODE_FSYNC_BIT, inode);
+ atomic_dec(&sb->fsync_pending);
+ goto fail;
+ }
+ }
+
+ /*
+ * We own INODE_FSYNC and the delta backend is not running so
+ * if inode is dirty here then it it will still be dirty when we
+ * move it to the backend dirty list. Otherwise, the inode is
+ * clean and fsync should exit here. We owned INODE_FSYNC for a
+ * short time so there might be tasks waiting on fsync_collide.
+ * Similarly, we might own FSYNC_RUNNING and therefore must start
+ * the fsync backend in case some other task failed to own it and
+ * therefore assumes it is running.
+ */
+ if (!tux3_dirty_flags1(inode, front_delta)) {
+ trace("inode %Lu is already clean", (long long)tuxnode->inum);
+ tux3_inode_clear_flag(TUX3_INODE_FSYNC_BIT, inode);
+ atomic_dec(&sb->fsync_pending);
+ if (start_backend)
+ queue_work(sb->fsync_workqueue, &sb->fsync_work.work);
+ wake_up_all(&sb->fsync_collide);
+ return 0;
+ }
+
+ /*
+ * Exclude new dirties.
+ * Lock order: i_mutex => truncate_lock
+ */
+ mutex_lock(&inode->i_mutex); /* Exclude most dirty sources */
+ down_write(&tux_inode(inode)->truncate_lock); /* Exclude mmap */
+
+ /*
+ * Force block dirty state to previous delta for each dirty
+ * block so block fork protects block data against modify by
+ * parallel tasks while this task waits for commit.
+ *
+ * This walk should not discover any dirty blocks belonging
+ * to the previous delta due to the above wait for delta
+ * commit.
+ */
+ list_for_each_entry(buffer, &front_dirty->dirty_buffers, b_assoc_buffers) {
+ //assert(tux3_bufsta_get_delta(buffer->b_state) != delta - 1);
+ tux3_set_bufdelta(buffer, front_delta - 1);
+ }
+
+ /*
+ * Move the the front end dirty block list to the backend, which
+ * is now empty because the previous delta was completed. Remove
+ * the inode from the frontend dirty list and add it to the front
+ * fsync list. Note: this is not a list move because different
+ * link fields are involved. Later, the inode will be moved to
+ * the backend inode dirty list to be flushed but we cannot put
+ * it there right now because it might clobber the previous fsync
+ * group. Update the inode dirty flags to indicate the inode is
+ * dirty in the back, not the front. The list moves must be
+ * under the spin lock to prevent the back end from bumping
+ * the group counter and proceeding with the commit.
+ */
+ trace("fsync queue inode %Lu to group %u",
+ (long long)tuxnode->inum, fsync_group(sb));
+ spin_lock(&tuxnode->lock);
+ spin_lock(&sb->dirty_inodes_lock);
+ //assert(<inode is not dirty in back>);
+ assert(list_empty(&back_dirty->dirty_buffers));
+ assert(list_empty(&back_dirty->dirty_holes));
+ assert(!list_empty(&front_dirty->dirty_list));
+ list_splice_init(&front_dirty->dirty_buffers, &back_dirty->dirty_buffers);
+ list_splice_init(&front_dirty->dirty_holes, &back_dirty->dirty_holes);
+ list_del_init(&front_dirty->dirty_list);
+ spin_unlock(&sb->dirty_inodes_lock);
+
+ tux3_dirty_switch_to_prev(inode, front_delta);
+ spin_unlock(&tuxnode->lock);
+
+ spin_lock(&sb->fsync_lock);
+ front = &sb->fsync[fsync_group(sb) % fsync_wrap];
+ list_add_tail(&back_dirty->dirty_list, &front->list);
+ atomic_inc(&front->busy); /* detect queue full */
+ assert(sb->current_delta->delta == front_delta); /* last chance to check */
+ spin_unlock(&sb->fsync_lock);
+
+ /*
+ * Allow more dirties during the wait. These will be isolated from
+ * the commit by block forking.
+ */
+ up_write(&tux_inode(inode)->truncate_lock);
+ mutex_unlock(&inode->i_mutex);
+
+ if (start_backend)
+ queue_work(sb->fsync_workqueue, &sb->fsync_work.work);
+
+ wait_for_completion(&front->wait);
+ atomic_dec(&front->busy);
+fail:
+ if (err)
+ tux3_warn(sb, "error %i!!!", err);
+ return err;
+}
diff --git a/fs/tux3/iattr.c b/fs/tux3/iattr.c
index 57a383b..7ac73f5 100644
--- a/fs/tux3/iattr.c
+++ b/fs/tux3/iattr.c
@@ -276,6 +276,8 @@ static int iattr_decode(struct btree *btree, void *data, void *attrs, int size)
}

decode_attrs(inode, attrs, size); // error???
+ tux_inode(inode)->nlink_base = inode->i_nlink;
+
if (tux3_trace)
dump_attrs(inode);
if (tux_inode(inode)->xcache)
diff --git a/fs/tux3/inode.c b/fs/tux3/inode.c
index f747c0e..a10ce38 100644
--- a/fs/tux3/inode.c
+++ b/fs/tux3/inode.c
@@ -922,22 +922,18 @@ void iget_if_dirty(struct inode *inode)
atomic_inc(&inode->i_count);
}

+enum { fsync_fallback = 0 };
+
/* Synchronize changes to a file and directory. */
int tux3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct sb *sb = tux_sb(inode->i_sb);

- /* FIXME: this is sync(2). We should implement real one */
- static int print_once;
- if (!print_once) {
- print_once++;
- tux3_warn(sb,
- "fsync(2) fall-back to sync(2): %Lx-%Lx, datasync %d",
- start, end, datasync);
- }
+ if (fsync_fallback || S_ISDIR(inode->i_mode))
+ return sync_current_delta(sb);

- return sync_current_delta(sb);
+ return tux3_sync_inode(sb, inode);
}

int tux3_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
diff --git a/fs/tux3/log.c b/fs/tux3/log.c
index bb26c73..a934659 100644
--- a/fs/tux3/log.c
+++ b/fs/tux3/log.c
@@ -83,6 +83,7 @@ unsigned log_size[] = {
[LOG_BNODE_FREE] = 7,
[LOG_ORPHAN_ADD] = 9,
[LOG_ORPHAN_DEL] = 9,
+ [LOG_FSYNC_ORPHAN] = 9,
[LOG_FREEBLOCKS] = 7,
[LOG_UNIFY] = 1,
[LOG_DELTA] = 1,
@@ -470,6 +471,11 @@ void log_bnode_free(struct sb *sb, block_t bnode)
log_u48(sb, LOG_BNODE_FREE, bnode);
}

+void log_fsync_orphan(struct sb *sb, unsigned version, tuxkey_t inum)
+{
+ log_u16_u48(sb, LOG_FSYNC_ORPHAN, version, inum);
+}
+
/*
* Handle inum as orphan inode
* (this is log of frontend operation)
diff --git a/fs/tux3/orphan.c b/fs/tux3/orphan.c
index 68d08e8..3ea2d6a 100644
--- a/fs/tux3/orphan.c
+++ b/fs/tux3/orphan.c
@@ -336,7 +336,30 @@ static int load_orphan_inode(struct sb *sb, inum_t inum, struct list_head *head)
tux3_mark_inode_orphan(tux_inode(inode));
/* List inode up, then caller will decide what to do */
list_add(&tux_inode(inode)->orphan_list, head);
+ return 0;
+}

+int replay_fsync_orphan(struct replay *rp, unsigned version, inum_t inum)
+{
+ struct sb *sb = rp->sb;
+ struct inode *inode = tux3_iget(sb, inum);
+ if (IS_ERR(inode)) {
+ int err = PTR_ERR(inode);
+ return err == -ENOENT ? 0 : err;
+ }
+
+ /*
+ * Multiple fsyncs of new inode can create multiple fsync orphan
+ * log records for the same inode. A later delta may have added a
+ * link.
+ */
+ if (inode->i_nlink != 0 || tux3_inode_is_orphan(tux_inode(inode))) {
+ iput(inode);
+ return 0;
+ }
+
+ tux3_mark_inode_orphan(tux_inode(inode));
+ list_add(&tux_inode(inode)->orphan_list, &rp->orphan_in_otree);
return 0;
}

diff --git a/fs/tux3/replay.c b/fs/tux3/replay.c
index f1f77e8..99361d6 100644
--- a/fs/tux3/replay.c
+++ b/fs/tux3/replay.c
@@ -29,6 +29,7 @@ static const char *const log_name[] = {
X(LOG_BNODE_FREE),
X(LOG_ORPHAN_ADD),
X(LOG_ORPHAN_DEL),
+ X(LOG_FSYNC_ORPHAN),
X(LOG_FREEBLOCKS),
X(LOG_UNIFY),
X(LOG_DELTA),
@@ -117,20 +118,20 @@ static void replay_unpin_logblocks(struct sb *sb, unsigned i, unsigned logcount)
static struct replay *replay_prepare(struct sb *sb)
{
block_t logchain = be64_to_cpu(sb->super.logchain);
- unsigned i, logcount = be32_to_cpu(sb->super.logcount);
+ unsigned i, count = logcount(sb);
struct replay *rp;
struct buffer_head *buffer;
int err;

/* FIXME: this address array is quick hack. Rethink about log
* block management and log block address. */
- rp = alloc_replay(sb, logcount);
+ rp = alloc_replay(sb, count);
if (IS_ERR(rp))
return rp;

/* FIXME: maybe, we should use bufvec to read log blocks */
- trace("load %u logblocks", logcount);
- i = logcount;
+ trace("load %u logblocks", count);
+ i = count;
while (i-- > 0) {
struct logblock *log;

@@ -156,7 +157,7 @@ static struct replay *replay_prepare(struct sb *sb)

error:
free_replay(rp);
- replay_unpin_logblocks(sb, i, logcount);
+ replay_unpin_logblocks(sb, i, count);

return ERR_PTR(err);
}
@@ -169,7 +170,7 @@ static void replay_done(struct replay *rp)
clean_orphan_list(&rp->log_orphan_add); /* for error path */
free_replay(rp);

- sb->logpos.next = be32_to_cpu(sb->super.logcount);
+ sb->logpos.next = logcount(sb);
replay_unpin_logblocks(sb, 0, sb->logpos.next);
log_finish_cycle(sb, 0);
}
@@ -319,6 +320,7 @@ static int replay_log_stage1(struct replay *rp, struct buffer_head *logbuf)
case LOG_BFREE_RELOG:
case LOG_LEAF_REDIRECT:
case LOG_LEAF_FREE:
+ case LOG_FSYNC_ORPHAN:
case LOG_ORPHAN_ADD:
case LOG_ORPHAN_DEL:
case LOG_UNIFY:
@@ -450,6 +452,7 @@ static int replay_log_stage2(struct replay *rp, struct buffer_head *logbuf)
return err;
break;
}
+ case LOG_FSYNC_ORPHAN:
case LOG_ORPHAN_ADD:
case LOG_ORPHAN_DEL:
{
@@ -459,6 +462,9 @@ static int replay_log_stage2(struct replay *rp, struct buffer_head *logbuf)
data = decode48(data, &inum);
trace("%s: version 0x%x, inum 0x%Lx",
log_name[code], version, inum);
+ if (code == LOG_FSYNC_ORPHAN)
+ err = replay_fsync_orphan(rp, version, inum);
+ else
if (code == LOG_ORPHAN_ADD)
err = replay_orphan_add(rp, version, inum);
else
@@ -514,11 +520,11 @@ static int replay_logblocks(struct replay *rp, replay_log_t replay_log_func)
{
struct sb *sb = rp->sb;
struct logpos *logpos = &sb->logpos;
- unsigned logcount = be32_to_cpu(sb->super.logcount);
+ unsigned count = logcount(sb);
int err;

logpos->next = 0;
- while (logpos->next < logcount) {
+ while (logpos->next < count) {
trace("log block %i, blocknr %Lx, unify %Lx",
logpos->next, rp->blocknrs[logpos->next],
rp->unify_index);
diff --git a/fs/tux3/super.c b/fs/tux3/super.c
index b104dc7..0913d26 100644
--- a/fs/tux3/super.c
+++ b/fs/tux3/super.c
@@ -63,6 +63,7 @@ static void tux3_inode_init_always(struct tux3_inode *tuxnode)
tuxnode->xcache = NULL;
tuxnode->generic = 0;
tuxnode->state = 0;
+ tuxnode->nlink_base = 0;
#ifdef __KERNEL__
tuxnode->io = NULL;
#endif
@@ -246,6 +247,9 @@ static void __tux3_put_super(struct sb *sbi)
sbi->idefer_map = NULL;
/* FIXME: add more sanity check */
assert(link_empty(&sbi->forked_buffers));
+
+ if (sbi->fsync_workqueue)
+ destroy_workqueue(sbi->fsync_workqueue);
}

static struct inode *create_internal_inode(struct sb *sbi, inum_t inum,
@@ -384,6 +388,21 @@ static int init_sb(struct sb *sb)
for (i = 0; i < ARRAY_SIZE(sb->s_ddc); i++)
INIT_LIST_HEAD(&sb->s_ddc[i].dirty_inodes);

+ for (i = 0; i < fsync_wrap; i++) {
+ INIT_LIST_HEAD(&sb->fsync[i].list);
+ init_completion(&sb->fsync[i].wait);
+ atomic_set(&sb->fsync[i].busy, 0);
+ }
+
+ if (!(sb->fsync_workqueue = create_workqueue("tux3-work")))
+ return -ENOMEM;
+
+ atomic_set(&sb->fsync_group, 0);
+ atomic_set(&sb->fsync_pending, 0);
+ spin_lock_init(&sb->fsync_lock);
+ init_waitqueue_head(&sb->fsync_collide);
+ INIT_WORK(&sb->fsync_work.work, fsync_backend);
+ sb->fsync_work.sb = sb;
sb->idefer_map = tux3_alloc_idefer_map();
if (!sb->idefer_map)
return -ENOMEM;
@@ -773,7 +792,7 @@ static int tux3_fill_super(struct super_block *sb, void *data, int silent)
goto error;
}
}
- tux3_dbg("s_blocksize %lu", sb->s_blocksize);
+ tux3_dbg("s_blocksize %lu, sb = %p", sb->s_blocksize, tux_sb(sb));

rp = tux3_init_fs(sbi);
if (IS_ERR(rp)) {
@@ -781,6 +800,7 @@ static int tux3_fill_super(struct super_block *sb, void *data, int silent)
goto error;
}

+ sb->s_flags |= MS_ACTIVE;
err = replay_stage3(rp, 1);
if (err) {
rp = NULL;
diff --git a/fs/tux3/tux3.h b/fs/tux3/tux3.h
index e2f2d9b..cf4bcc6 100644
--- a/fs/tux3/tux3.h
+++ b/fs/tux3/tux3.h
@@ -252,6 +252,7 @@ enum {
LOG_BNODE_FREE, /* Log of freeing bnode */
LOG_ORPHAN_ADD, /* Log of adding orphan inode */
LOG_ORPHAN_DEL, /* Log of deleting orphan inode */
+ LOG_FSYNC_ORPHAN, /* Log inode fsync with no links */
LOG_FREEBLOCKS, /* Log of freeblocks in bitmap on unify */
LOG_UNIFY, /* Log of marking unify */
LOG_DELTA, /* just for debugging */
@@ -310,6 +311,29 @@ struct tux3_mount_opt {
unsigned int flags;
};

+/* Per fsync group dirty inodes and synchronization */
+struct syncgroup {
+ struct list_head list; /* dirty inodes */
+ struct completion wait; /* commit wait */
+ atomic_t busy; /* fsyncs not completed */
+};
+
+struct fsync_work {
+ struct work_struct work;
+ struct sb *sb;
+};
+
+enum { fsync_wrap = 1 << 4 }; /* Maximum fsync groups in flight */
+
+enum sb_state_bits {
+ TUX3_STATE_TRANSITION_BIT,
+ TUX3_FSYNC_DRAIN_BIT, /* force fsync queue to drain */
+};
+
+enum sb_flag_bits {
+ SB_FSYNC_FLUSH_FLAG = 1 << 0, /* fsync specific actions on flush path */
+};
+
struct tux3_idefer_map;
/* Tux3-specific sb is a handle for the entire volume state */
struct sb {
@@ -321,10 +345,8 @@ struct sb {
struct delta_ref __rcu *current_delta; /* current delta */
struct delta_ref delta_refs[TUX3_MAX_DELTA];
unsigned unify; /* log unify cycle */
-
-#define TUX3_STATE_TRANSITION_BIT 0
unsigned long backend_state; /* delta state */
-
+ unsigned long flags; /* non atomic state */
#ifdef TUX3_FLUSHER_SYNC
struct rw_semaphore delta_lock; /* delta transition exclusive */
#else
@@ -403,7 +425,28 @@ struct sb {
#else
struct super_block vfs_sb; /* Userland superblock */
#endif
-};
+ /*
+ * Fsync and fsync backend
+ */
+ spinlock_t fsync_lock;
+ wait_queue_head_t fsync_collide; /* parallel fsync on same inode */
+ atomic_t fsync_group; /* current fsync group */
+ atomic_t fsync_pending; /* fsyncs started but not yet queued */
+ struct syncgroup fsync[fsync_wrap]; /* fsync commit groups */
+ struct workqueue_struct *fsync_workqueue;
+ struct fsync_work fsync_work;
+ struct delta_ref *suspended;
+ };
+
+static inline int fsync_mode(struct sb *sb)
+{
+ return sb->flags & SB_FSYNC_FLUSH_FLAG;
+}
+
+static inline unsigned logcount(struct sb *sb)
+{
+ return be32_to_cpu(sb->super.logcount);
+}

/* Block segment (physical block extent) info */
#define BLOCK_SEG_HOLE (1 << 0)
@@ -475,6 +518,7 @@ struct tux3_inode {
};

/* Per-delta dirty data for inode */
+ unsigned nlink_base; /* link count on media for fsync */
unsigned state; /* inode dirty state */
unsigned present; /* Attributes decoded from or
* to be encoded to itree */
@@ -553,6 +597,8 @@ static inline struct list_head *tux3_dirty_buffers(struct inode *inode,
enum {
/* Deferred inum allocation, and not stored into itree yet. */
TUX3_I_DEFER_INUM = 0,
+ /* Fsync in progress (protected by i_mutex) */
+ TUX3_INODE_FSYNC_BIT = 1,

/* No per-delta buffers, and no page forking */
TUX3_I_NO_DELTA = 29,
@@ -579,6 +625,11 @@ static inline void tux3_inode_clear_flag(int bit, struct inode *inode)
clear_bit(bit, &tux_inode(inode)->flags);
}

+static inline int tux3_inode_test_and_set_flag(int bit, struct inode *inode)
+{
+ return test_and_set_bit(bit, &tux_inode(inode)->flags);
+}
+
static inline int tux3_inode_test_flag(int bit, struct inode *inode)
{
return test_bit(bit, &tux_inode(inode)->flags);
@@ -723,6 +774,8 @@ static inline block_t bufindex(struct buffer_head *buffer)
/* commit.c */
long tux3_writeback(struct super_block *super, struct bdi_writeback *wb,
struct wb_writeback_work *work);
+int tux3_sync_inode(struct sb *sb, struct inode *inode);
+void fsync_backend(struct work_struct *work);

/* dir.c */
extern const struct file_operations tux_dir_fops;
@@ -967,6 +1020,7 @@ void log_bnode_merge(struct sb *sb, block_t src, block_t dst);
void log_bnode_del(struct sb *sb, block_t node, tuxkey_t key, unsigned count);
void log_bnode_adjust(struct sb *sb, block_t node, tuxkey_t from, tuxkey_t to);
void log_bnode_free(struct sb *sb, block_t bnode);
+void log_fsync_orphan(struct sb *sb, unsigned version, tuxkey_t inum);
void log_orphan_add(struct sb *sb, unsigned version, tuxkey_t inum);
void log_orphan_del(struct sb *sb, unsigned version, tuxkey_t inum);
void log_freeblocks(struct sb *sb, block_t freeblocks);
@@ -995,6 +1049,7 @@ void replay_iput_orphan_inodes(struct sb *sb,
struct list_head *orphan_in_otree,
int destroy);
int replay_load_orphan_inodes(struct replay *rp);
+int replay_fsync_orphan(struct replay *rp, unsigned version, inum_t inum);

/* super.c */
struct replay *tux3_init_fs(struct sb *sbi);
@@ -1045,6 +1100,8 @@ static inline void tux3_mark_inode_dirty_sync(struct inode *inode)
__tux3_mark_inode_dirty(inode, I_DIRTY_SYNC);
}

+unsigned tux3_dirty_flags1(struct inode *inode, unsigned delta);
+void tux3_dirty_switch_to_prev(struct inode *inode, unsigned delta);
void tux3_dirty_inode(struct inode *inode, int flags);
void tux3_mark_inode_to_delete(struct inode *inode);
void tux3_iattrdirty(struct inode *inode);
@@ -1058,6 +1115,7 @@ void tux3_mark_inode_orphan(struct tux3_inode *tuxnode);
int tux3_inode_is_orphan(struct tux3_inode *tuxnode);
int tux3_flush_inode_internal(struct inode *inode, unsigned delta, int req_flag);
int tux3_flush_inode(struct inode *inode, unsigned delta, int req_flag);
+int tux3_flush_inodes_list(struct sb *sb, unsigned delta, struct list_head *dirty_inodes);
int tux3_flush_inodes(struct sb *sb, unsigned delta);
int tux3_has_dirty_inodes(struct sb *sb, unsigned delta);
void tux3_clear_dirty_inodes(struct sb *sb, unsigned delta);
diff --git a/fs/tux3/user/libklib/libklib.h b/fs/tux3/user/libklib/libklib.h
index 31daad5..ae9bba6 100644
--- a/fs/tux3/user/libklib/libklib.h
+++ b/fs/tux3/user/libklib/libklib.h
@@ -117,4 +117,7 @@ extern int __build_bug_on_failed;
#define S_IWUGO (S_IWUSR|S_IWGRP|S_IWOTH)
#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)

+struct work_struct { };
+struct workqueue_struct { };
+
#endif /* !LIBKLIB_H */
diff --git a/fs/tux3/user/super.c b/fs/tux3/user/super.c
index e34a1b4..0743551 100644
--- a/fs/tux3/user/super.c
+++ b/fs/tux3/user/super.c
@@ -15,6 +15,15 @@
#define trace trace_off
#endif

+static struct workqueue_struct *create_workqueue(char *name) {
+ static struct workqueue_struct fakework = { };
+ return &fakework;
+}
+
+static void destroy_workqueue(struct workqueue_struct *wq) { }
+
+#define INIT_WORK(work, fn)
+
#include "../super.c"

struct inode *__alloc_inode(struct super_block *sb)
diff --git a/fs/tux3/writeback.c b/fs/tux3/writeback.c
index fc20635..5c6bcf0 100644
--- a/fs/tux3/writeback.c
+++ b/fs/tux3/writeback.c
@@ -102,6 +102,22 @@ static inline unsigned tux3_dirty_flags(struct inode *inode, unsigned delta)
return ret;
}

+unsigned tux3_dirty_flags1(struct inode *inode, unsigned delta)
+{
+ return (tux_inode(inode)->state >> tux3_dirty_shift(delta)) & I_DIRTY;
+}
+
+static inline unsigned tux3_iattrsta_update(unsigned state, unsigned delta);
+void tux3_dirty_switch_to_prev(struct inode *inode, unsigned delta)
+{
+ struct tux3_inode *tuxnode = tux_inode(inode);
+ unsigned state = tuxnode->state;
+
+ state |= tux3_dirty_mask(tux3_dirty_flags(inode, delta) & I_DIRTY, delta - 1);
+ state &= ~tux3_dirty_mask(I_DIRTY, delta);
+ tuxnode->state = tux3_iattrsta_update(state, delta - 1);
+}
+
/* This is hook of __mark_inode_dirty() and called I_DIRTY_PAGES too */
void tux3_dirty_inode(struct inode *inode, int flags)
{
@@ -226,6 +242,8 @@ static void tux3_clear_dirty_inode_nolock(struct inode *inode, unsigned delta,
/* Update state if inode isn't dirty anymore */
if (!(tuxnode->state & ~NON_DIRTY_FLAGS))
inode->i_state &= ~I_DIRTY;
+
+ tux3_inode_clear_flag(TUX3_INODE_FSYNC_BIT, inode); /* ugly */
}

/* Clear dirty flags for delta */
@@ -502,12 +520,31 @@ int tux3_flush_inode(struct inode *inode, unsigned delta, int req_flag)
dirty = tux3_dirty_flags(inode, delta);

if (dirty & (TUX3_DIRTY_BTREE | I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ struct tux3_inode *tuxnode = tux_inode(inode);
+ struct sb *sb = tux_sb(inode->i_sb);
/*
* If there is btree root, adjust present after
* tux3_flush_buffers().
*/
tux3_iattr_adjust_for_btree(inode, &idata);

+ if (fsync_mode(sb)) {
+ if (idata.i_nlink != tuxnode->nlink_base) {
+ /*
+ * FIXME: we redirty inode attributes here so next delta
+ * will flush correct nlinks. This means that an fsync
+ * of the same inode before the next delta will flush
+ * it again even it has not been changed.
+ */
+ tux3_iattrdirty_delta(inode, sb->suspended->delta);
+ tux3_mark_inode_dirty_sync(inode);
+ idata.i_nlink = tuxnode->nlink_base;
+ }
+ if (!idata.i_nlink)
+ log_fsync_orphan(sb, sb->version, tuxnode->inum);
+ } else
+ tuxnode->nlink_base = idata.i_nlink;
+
err = tux3_save_inode(inode, &idata, delta);
if (err && !ret)
ret = err;
@@ -569,10 +606,8 @@ static int inode_inum_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}

-int tux3_flush_inodes(struct sb *sb, unsigned delta)
+int tux3_flush_inodes_list(struct sb *sb, unsigned delta, struct list_head *dirty_inodes)
{
- struct sb_delta_dirty *s_ddc = tux3_sb_ddc(sb, delta);
- struct list_head *dirty_inodes = &s_ddc->dirty_inodes;
struct inode_delta_dirty *i_ddc, *safe;
inum_t private;
int err;
@@ -612,6 +647,12 @@ error:
return err;
}

+int tux3_flush_inodes(struct sb *sb, unsigned delta)
+{
+ struct sb_delta_dirty *s_ddc = tux3_sb_ddc(sb, delta);
+ return tux3_flush_inodes_list(sb, delta, &s_ddc->dirty_inodes);
+}
+
int tux3_has_dirty_inodes(struct sb *sb, unsigned delta)
{
struct sb_delta_dirty *s_ddc = tux3_sb_ddc(sb, delta);
@@ -663,3 +704,4 @@ unsigned tux3_check_tuxinode_state(struct inode *inode)
{
return tux_inode(inode)->state & ~NON_DIRTY_FLAGS;
}
+
diff --git a/fs/tux3/writeback_iattrfork.c b/fs/tux3/writeback_iattrfork.c
index 658c012..c50a8c2 100644
--- a/fs/tux3/writeback_iattrfork.c
+++ b/fs/tux3/writeback_iattrfork.c
@@ -54,10 +54,9 @@ static void idata_copy(struct inode *inode, struct tux3_iattr_data *idata)
*
* FIXME: this is better to call tux3_mark_inode_dirty() too?
*/
-void tux3_iattrdirty(struct inode *inode)
+void tux3_iattrdirty_delta(struct inode *inode, unsigned delta)
{
struct tux3_inode *tuxnode = tux_inode(inode);
- unsigned delta = tux3_inode_delta(inode);
unsigned state = tuxnode->state;

/* If dirtied on this delta, nothing to do */
@@ -107,6 +106,11 @@ void tux3_iattrdirty(struct inode *inode)
spin_unlock(&tuxnode->lock);
}

+void tux3_iattrdirty(struct inode *inode)
+{
+ tux3_iattrdirty_delta(inode, tux3_inode_delta(inode));
+}
+
/* Caller must hold tuxnode->lock */
static void tux3_iattr_clear_dirty(struct tux3_inode *tuxnode)
{
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/