[patch 3/4] [jbd] Add support for journal-guided resync.
From: scjody
Date: Thu Oct 01 2009 - 18:41:14 EST
Adds support for declare blocks, used by ext3's journal-guided resync (declared
mode). A declare block is written to the journal to list the data blocks that
will be written during the current transaction. During journal replay, we
perform a RAID resync of only those blocks and skip the rest of the resync.
Index: linux-2.6.18-128.1.6/fs/jbd/checkpoint.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/checkpoint.c
+++ linux-2.6.18-128.1.6/fs/jbd/checkpoint.c
@@ -712,6 +712,8 @@ void __journal_drop_transaction(journal_
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
+ J_ASSERT(transaction->t_declare_root.rnode == NULL);
+ J_ASSERT(transaction->t_declare_done_root.rnode == NULL);
J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
Index: linux-2.6.18-128.1.6/fs/jbd/commit.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c
+++ linux-2.6.18-128.1.6/fs/jbd/commit.c
@@ -373,6 +373,262 @@ static inline __u32 jbd_checksum_data(__
return checksum;
}
+int wait_for_descriptors(journal_t *journal, transaction_t *trans)
+{
+ struct journal_head *jh;
+ struct buffer_head *bh;
+ int err = 0;
+
+wait_for_ctlbuf:
+
+	while (trans->t_log_list != NULL) {
+		jh = trans->t_log_list->b_tprev;
+ bh = jh2bh(jh);
+ if (buffer_locked(bh)) {
+ wait_on_buffer(bh);
+ goto wait_for_ctlbuf;
+ }
+ if (cond_resched())
+ goto wait_for_ctlbuf;
+
+ if (unlikely(!buffer_uptodate(bh)))
+ err = -EIO;
+
+ BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+ clear_buffer_jwrite(bh);
+ journal_unfile_buffer(journal, jh);
+ journal_put_journal_head(jh);
+ __brelse(bh); /* One for getblk */
+ }
+
+ return err;
+}
+
+struct journal_head *get_descriptor(journal_t *journal, transaction_t *trans,
+				     int blocktype, char **tagp, int *space_left)
+{
+ struct journal_head *descriptor;
+ struct buffer_head *dbh;
+ journal_header_t *header;
+
+ jbd_debug(4, "JBD: get descriptor\n");
+
+ descriptor = journal_get_descriptor_buffer(journal);
+ if (!descriptor)
+ return NULL;
+
+ dbh = jh2bh(descriptor);
+ jbd_debug(4, "JBD: got buffer %llu (%p)\n",
+ (unsigned long long)dbh->b_blocknr, dbh->b_data);
+ header = (journal_header_t *)&dbh->b_data[0];
+ header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+ header->h_blocktype = cpu_to_be32(blocktype);
+ header->h_sequence = cpu_to_be32(trans->t_tid);
+
+ *tagp = &dbh->b_data[sizeof(journal_header_t)];
+ *space_left = dbh->b_size - sizeof(journal_header_t);
+
+ set_buffer_jwrite(dbh);
+ set_buffer_dirty(dbh);
+
+ /* Record it so that we can wait for it later */
+ BUFFER_TRACE(dbh, "ph3: file as descriptor");
+ journal_file_buffer(descriptor, trans, BJ_LogCtl);
+
+ return descriptor;
+}
+
+/*
+ * Write declare blocks containing a list of the data blocks that will be
+ * written out
+ */
+void write_declare_blocks(journal_t *journal, transaction_t *transaction,
+ int committing)
+{
+ struct journal_head *jh, *descriptor = NULL;
+ struct buffer_head *bh;
+ int i, bufs = 0, err;
+ unsigned int n, count = 0, to_write;
+ unsigned long nextblock = 0;
+ char *tagp = NULL;
+ journal_block_tag_t *tag = NULL;
+ int space_left = 0, first_tag = 0, tag_flag;
+ struct radix_tree_root *root;
+
+ root = &transaction->t_declare_root;
+
+ spin_lock(&journal->j_list_lock);
+ to_write = transaction->t_declare_request;
+ transaction->t_declare_request = 0;
+ spin_unlock(&journal->j_list_lock);
+
+ if (to_write == UINT_MAX)
+ jbd_debug (1, "jbd: tid %d write declare request for ALL "
+ "blocks\n", transaction->t_tid);
+ else
+ jbd_debug (1, "jbd: tid %d write declare request for %u "
+ "blocks\n", transaction->t_tid, to_write);
+write_declare:
+ cond_resched();
+ spin_lock(&journal->j_list_lock);
+
+ n = radix_tree_gang_lookup(root, journal->j_declare_jhs, nextblock, 1);
+ while (n) {
+ if (!descriptor) {
+ J_ASSERT(bufs == 0);
+
+ spin_unlock(&journal->j_list_lock);
+
+ descriptor = get_descriptor(journal, transaction,
+ JFS_DECLARE_BLOCK,
+ &tagp, &space_left);
+
+ if (!descriptor) {
+ journal_abort(journal, -EIO);
+ return;
+ }
+
+ first_tag = 1;
+ journal->j_declare_bhs[bufs++] = jh2bh(descriptor);
+
+ goto write_declare;
+ }
+
+ jh = (struct journal_head *)journal->j_declare_jhs[0];
+ bh = jh2bh(jh);
+
+ /* refile the buffer as having been declared */
+ if (!inverted_lock(journal, bh))
+ goto write_declare;
+ __journal_unfile_buffer(jh);
+ __journal_file_buffer(jh, transaction, BJ_DeclareDone);
+
+ jbd_unlock_bh_state(bh);
+
+ /* record the block's tag in the current descriptor buffer */
+ tag_flag = 0;
+ if (!first_tag)
+ tag_flag |= JFS_FLAG_SAME_UUID;
+
+ tag = (journal_block_tag_t *)tagp;
+ tag->t_blocknr = cpu_to_be32(bh->b_blocknr);
+ tag->t_flags = cpu_to_be32(tag_flag);
+ tagp += sizeof(journal_block_tag_t);
+ space_left -= sizeof(journal_block_tag_t);
+
+ if (first_tag) {
+ memcpy (tagp, journal->j_uuid, 16);
+ tagp += 16;
+ space_left -= 16;
+ first_tag = 0;
+ }
+
+ count++;
+
+ /* advance to the next journal head and buffer */
+ nextblock = bh->b_blocknr + 1;
+ n = radix_tree_gang_lookup(root, journal->j_declare_jhs,
+ nextblock, 1);
+
+ /* If there's no more to do, or if the descriptor is full,
+ let the IO rip! */
+
+ if (bufs == ARRAY_SIZE(journal->j_declare_bhs) || n == 0 ||
+ count == to_write ||
+ space_left < sizeof(journal_block_tag_t) + 16) {
+
+ jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+
+ /* Write an end-of-descriptor marker before
+ * submitting the IOs. "tag" still points to
+ * the last tag we set up.
+ */
+
+ tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
+
+ spin_unlock(&journal->j_list_lock);
+
+ for (i = 0; i < bufs; i++) {
+ struct buffer_head *bh = journal->j_declare_bhs[i];
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ bh->b_end_io = journal_end_buffer_io_sync;
+ submit_bh(WRITE, bh);
+ }
+
+ cond_resched();
+ spin_lock(&journal->j_list_lock);
+
+ /* force a new descriptor to be generated next time */
+ descriptor = NULL;
+ bufs = 0;
+
+ /* need to redo tree lookup since we lost the lock,
+ but that will happen after we get a new descriptor */
+ }
+
+		if (count == to_write)
+			break;
+ }
+ spin_unlock(&journal->j_list_lock);
+
+ jbd_debug(2, "jbd: tid %d wrote declarations for %u blocks\n",
+ transaction->t_tid, count);
+ if (to_write == UINT_MAX)
+ J_ASSERT(transaction->t_declare_root.rnode == NULL);
+
+	/* wait for the declare blocks to be written */
+	err = wait_for_descriptors(journal, transaction);
+	if (err)
+		journal_abort(journal, err);
+
+ /* move the declared buffers to the sync data list */
+
+ root = &transaction->t_declare_done_root;
+ count = 0;
+ nextblock = 0;
+
+move_declare:
+ cond_resched();
+ spin_lock(&journal->j_list_lock);
+
+ while ((n = radix_tree_gang_lookup(root, journal->j_declare_jhs,
+ nextblock,
+ ARRAY_SIZE(journal->j_declare_jhs)))) {
+ /* loop and move the journal heads */
+ for (i = 0; i < n; i++) {
+ jh = journal->j_declare_jhs[i];
+ bh = jh2bh(jh);
+
+ if (!inverted_lock(journal, bh)) {
+ goto move_declare;
+ }
+ __journal_unfile_buffer(jh);
+
+ if (committing)
+ /* set buffer dirty for writing below */
+ set_buffer_dirty(bh);
+ else
+ /* set page dirty for virtual memory */
+ mark_buffer_dirty(bh);
+
+ __journal_file_buffer(jh, transaction, BJ_SyncData);
+
+ count++;
+
+ nextblock = bh->b_blocknr + 1;
+
+ jbd_unlock_bh_state(bh);
+
+ if (lock_need_resched(&journal->j_list_lock)) {
+ spin_unlock(&journal->j_list_lock);
+ goto move_declare;
+ }
+ }
+ }
+ spin_unlock(&journal->j_list_lock);
+
+ jbd_debug(2, "jbd: tid %d moved %u declare blocks\n",
+ transaction->t_tid, count);
+}
+
/*
* journal_commit_transaction
*
@@ -390,7 +646,6 @@ void journal_commit_transaction(journal_
int err;
unsigned long blocknr;
char *tagp = NULL;
- journal_header_t *header;
journal_block_tag_t *tag = NULL;
int space_left = 0;
int first_tag = 0;
@@ -517,6 +772,11 @@ void journal_commit_transaction(journal_
jbd_debug (3, "JBD: commit phase 2\n");
+ if (journal->j_flags & JFS_DECLARE) {
+ commit_transaction->t_declare_request = UINT_MAX;
+ write_declare_blocks(journal, commit_transaction, 1);
+ }
+
/*
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
@@ -545,9 +805,13 @@ void journal_commit_transaction(journal_
* If we found any dirty or locked buffers, then we should have
* looped back up to the write_out_data label. If there weren't
* any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
+ * clean by now, so check that it is in fact empty. Also check
+ * declared mode trees - write_declare_blocks() should have left
+ * them empty.
*/
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(commit_transaction->t_declare_root.rnode == NULL);
+ J_ASSERT(commit_transaction->t_declare_done_root.rnode == NULL);
jbd_debug (3, "JBD: commit phase 3\n");
@@ -596,38 +860,20 @@ void journal_commit_transaction(journal_
record the metadata buffer. */
if (!descriptor) {
- struct buffer_head *bh;
-
J_ASSERT (bufs == 0);
- jbd_debug(4, "JBD: get descriptor\n");
+ descriptor = get_descriptor(journal,
+ commit_transaction,
+ JFS_DESCRIPTOR_BLOCK,
+ &tagp, &space_left);
- descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor) {
journal_abort(journal, -EIO);
continue;
}
- bh = jh2bh(descriptor);
- jbd_debug(4, "JBD: got buffer %llu (%p)\n",
- (unsigned long long)bh->b_blocknr, bh->b_data);
- header = (journal_header_t *)&bh->b_data[0];
- header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
- header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
- header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-
- tagp = &bh->b_data[sizeof(journal_header_t)];
- space_left = bh->b_size - sizeof(journal_header_t);
first_tag = 1;
- set_buffer_jwrite(bh);
- set_buffer_dirty(bh);
- wbuf[bufs++] = bh;
-
- /* Record it so that we can wait for IO
- completion later */
- BUFFER_TRACE(bh, "ph3: file as descriptor");
- journal_file_buffer(descriptor, commit_transaction,
- BJ_LogCtl);
+ wbuf[bufs++] = jh2bh(descriptor);
}
/* Where is the buffer to be written? */
@@ -826,29 +1072,7 @@ wait_for_iobuf:
jbd_debug(3, "JBD: commit phase 5\n");
/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
- while (commit_transaction->t_log_list != NULL) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_log_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_ctlbuf;
- }
- if (cond_resched())
- goto wait_for_ctlbuf;
-
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
-
- BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
- clear_buffer_jwrite(bh);
- journal_unfile_buffer(journal, jh);
- journal_put_journal_head(jh);
- __brelse(bh); /* One for getblk */
- /* AKPM: bforget here */
- }
+ err = wait_for_descriptors(journal, commit_transaction);
if (err)
journal_abort(journal, err);
@@ -904,6 +1128,8 @@ wait_for_iobuf:
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
J_ASSERT(commit_transaction->t_shadow_list == NULL);
J_ASSERT(commit_transaction->t_log_list == NULL);
+ J_ASSERT(commit_transaction->t_declare_root.rnode == NULL);
+ J_ASSERT(commit_transaction->t_declare_done_root.rnode == NULL);
restart_loop:
/*
Index: linux-2.6.18-128.1.6/fs/jbd/journal.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c
+++ linux-2.6.18-128.1.6/fs/jbd/journal.c
@@ -86,6 +86,10 @@ EXPORT_SYMBOL(journal_invalidatepage);
EXPORT_SYMBOL(journal_try_to_free_buffers);
EXPORT_SYMBOL(journal_bmap);
EXPORT_SYMBOL(journal_force_commit);
+EXPORT_SYMBOL(journal_write_declare);
+
+extern void write_declare_blocks(journal_t *journal,
+ transaction_t *commit_transaction, int committing);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
@@ -156,6 +160,16 @@ loop:
journal_commit_transaction(journal);
spin_lock(&journal->j_state_lock);
goto loop;
+ } else if (journal->j_flags & JFS_DECLARE &&
+ (transaction = journal->j_running_transaction) &&
+ transaction->t_declare_request) {
+ jbd_debug(2, "early declare\n");
+ spin_unlock(&journal->j_state_lock);
+ write_declare_blocks(journal, transaction, 0);
+ spin_lock(&journal->j_state_lock);
+
+ wake_up(&journal->j_wait_declare);
+ goto loop;
}
wake_up(&journal->j_wait_done_commit);
@@ -494,6 +508,38 @@ int journal_force_commit_nested(journal_
}
/*
+ * For ext3_fsync: start a request to declare the file's data and wait
+ * for the declarations to complete.
+ */
+int journal_write_declare(journal_t *journal)
+{
+ transaction_t *transaction = journal->j_running_transaction;
+ DEFINE_WAIT(wait);
+
+ if (transaction == NULL)
+ return 0;
+
+ spin_lock(&journal->j_list_lock);
+
+ if (transaction->t_declare_root.rnode == NULL) {
+ spin_unlock(&journal->j_list_lock);
+ return 0;
+ }
+
+ transaction->t_declare_request = UINT_MAX;
+
+ jbd_debug(1, "waking commit thread for fsync declare\n");
+ wake_up(&journal->j_wait_commit);
+
+ prepare_to_wait(&journal->j_wait_declare, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(&journal->j_wait_declare, &wait);
+
+ return 0;
+}
+
+/*
* Start a commit of the current running transaction (if any). Returns true
* if a transaction was started, and fills its tid in at *ptid
*/
@@ -959,6 +1005,7 @@ static journal_t * journal_init_common (
init_waitqueue_head(&journal->j_wait_checkpoint);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
+ init_waitqueue_head(&journal->j_wait_declare);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -1292,6 +1339,8 @@ static int journal_get_superblock(journa
J_ASSERT(bh != NULL);
if (!buffer_uptodate(bh)) {
+ /* TODO: resync the superblock */
+
ll_rw_block(READ, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
Index: linux-2.6.18-128.1.6/fs/jbd/recovery.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/recovery.c
+++ linux-2.6.18-128.1.6/fs/jbd/recovery.c
@@ -22,6 +22,7 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/crc32.h>
+#include <linux/raid/md.h>
#endif
/*
@@ -36,6 +37,9 @@ struct recovery_info
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
+ int nr_declared;
+
+ int resync_errors;
};
enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
@@ -43,6 +47,7 @@ static int do_one_pass(journal_t *journa
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
tid_t, struct recovery_info *);
+static int journal_syncraid(journal_t *, unsigned long);
#ifdef __KERNEL__
@@ -53,6 +58,37 @@ void journal_brelse_array(struct buffer_
brelse (b[n]);
}
+static int resync_range(journal_t *j, unsigned long start,
+ unsigned long end)
+{
+ int err;
+ struct inode *fake_inode = kmalloc(sizeof(*fake_inode), GFP_KERNEL);
+ mdu_range_t range;
+ sector_t sectors_per_block = j->j_blocksize >> 9;
+ mm_segment_t old_fs;
+
+ if (fake_inode == NULL) {
+ printk(KERN_ERR "JBD: Out of memory during recovery.\n");
+ return -ENOMEM;
+ }
+
+ fake_inode->i_bdev = j->j_fs_dev;
+ range.start = start * sectors_per_block;
+ range.end = end * sectors_per_block + sectors_per_block - 1;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = blkdev_driver_ioctl(fake_inode, NULL, j->j_fs_dev->bd_disk,
+ RESYNC_RANGE, (long)&range);
+ set_fs(old_fs);
+
+ jbd_debug(3, "RESYNC_RANGE of sectors %llu - %llu returned %d\n",
+ range.start, range.end, err);
+
+ kfree(fake_inode);
+
+ return err;
+}
+
/*
* When reading from the journal, we are going through the block device
@@ -67,7 +103,7 @@ void journal_brelse_array(struct buffer_
*/
#define MAXBUF 8
-static int do_readahead(journal_t *journal, unsigned int start)
+static int do_readahead(journal_t *journal, unsigned int start, int raid_sync)
{
int err;
unsigned int max, nbufs, next;
@@ -95,6 +131,14 @@ static int do_readahead(journal_t *journ
goto failed;
}
+ /* For declared mode: perform a raid synchronization for the
+ * journal blocks; this will resync all of the journal blocks
+ * read, which is more than strictly necessary.
+ */
+
+ if (raid_sync)
+ resync_range(journal, blocknr, blocknr);
+
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
err = -ENOMEM;
@@ -130,7 +175,7 @@ failed:
*/
static int jread(struct buffer_head **bhp, journal_t *journal,
- unsigned int offset)
+ unsigned int offset, int sync_raid)
{
int err;
unsigned long blocknr;
@@ -159,7 +204,7 @@ static int jread(struct buffer_head **bh
/* If this is a brand new buffer, start readahead.
Otherwise, we assume we are already reading it. */
if (!buffer_req(bh))
- do_readahead(journal, offset);
+ do_readahead(journal, offset, sync_raid);
wait_on_buffer(bh);
}
@@ -257,6 +302,30 @@ int journal_recover(journal_t *journal)
jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+ if (!err && !info.resync_errors && JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) {
+ /* Successful declared mode resync: instruct the block device
+ * to skip its resync */
+ struct inode *fake_inode;
+
+ jbd_debug(0, "JBD: Resynced %d declared blocks\n",
+ info.nr_declared);
+
+ fake_inode = kmalloc(sizeof(*fake_inode), GFP_KERNEL);
+ if (fake_inode) {
+ fake_inode->i_bdev = journal->j_fs_dev;
+ jbd_debug(1, "Sending SKIP_RESYNC ioctl\n");
+
+ blkdev_driver_ioctl(fake_inode, NULL,
+ journal->j_fs_dev->bd_disk,
+ SKIP_RESYNC, 0);
+ }
+ kfree(fake_inode);
+ }
+
+ journal_clear_features(journal, 0, 0,
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS);
+
/* Restart the log at the next transaction ID, thus invalidating
* any existing commit records in the log. */
journal->j_transaction_sequence = ++info.end_transaction;
@@ -329,7 +398,7 @@ static int calc_chksums(journal_t *journ
for (i = 0; i < num_blks; i++) {
io_block = (*next_log_block)++;
wrap(journal, *next_log_block);
- err = jread(&obh, journal, io_block);
+ err = jread(&obh, journal, io_block, 0);
if (err) {
printk(KERN_ERR "JBD: IO error %d recovering block "
"%lu in log\n", err, io_block);
@@ -355,6 +424,7 @@ static int do_one_pass(journal_t *journa
unsigned int sequence;
int blocktype;
__u32 crc32_sum = ~0; /* Transactional Checksums */
+ int raid_sync_journal = 0, raid_sync_data = 0;
/* Precompute the maximum metadata descriptors in a descriptor block */
int MAX_BLOCKS_PER_DESC;
@@ -397,9 +467,30 @@ static int do_one_pass(journal_t *journa
* check right now that we haven't gone past the end of
* the log. */
- if (pass != PASS_SCAN)
- if (tid_geq(next_commit_ID, info->end_transaction))
- break;
+ if (pass != PASS_SCAN) {
+ if (tid_geq(next_commit_ID, info->end_transaction)) {
+ /* For declared mode resync, move ahead past
+			 * the last committed transaction to deal with
+ * raid sync for declare blocks and the head
+ * of the journal.
+ */
+ if (pass == PASS_REPLAY &&
+ JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) {
+ if (journal->j_fs_dev == journal->j_dev)
+ raid_sync_journal = 1;
+ if (!raid_sync_data)
+ jbd_debug(1, "Declared mode was used; "
+ "performing raid sync %s\n",
+ raid_sync_journal ?
+ "of journal and data" :
+ "of data");
+ raid_sync_data = 1;
+			} else {
+				break;
+			}
+ }
+ }
jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block, journal->j_last);
@@ -409,7 +500,7 @@ static int do_one_pass(journal_t *journa
* record. */
jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
- err = jread(&bh, journal, next_log_block);
+ err = jread(&bh, journal, next_log_block, raid_sync_journal);
if (err)
goto failed;
@@ -426,6 +517,12 @@ static int do_one_pass(journal_t *journa
if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
brelse(bh);
+
+ /* raid sync the head of the journal */
+ if (raid_sync_journal) {
+ if (journal_syncraid(journal, next_log_block))
+ info->resync_errors++;
+ }
break;
}
@@ -436,6 +533,12 @@ static int do_one_pass(journal_t *journa
if (sequence != next_commit_ID) {
brelse(bh);
+
+ /* raid sync the head of the journal */
+ if (raid_sync_journal) {
+ if (journal_syncraid(journal, next_log_block))
+ info->resync_errors++;
+ }
break;
}
@@ -485,7 +588,8 @@ static int do_one_pass(journal_t *journa
io_block = next_log_block++;
wrap(journal, next_log_block);
- err = jread(&obh, journal, io_block);
+ err = jread(&obh, journal, io_block,
+ raid_sync_journal);
if (err) {
/* Recover what we can, but
* report failure at the end. */
@@ -668,6 +772,42 @@ static int do_one_pass(journal_t *journa
goto failed;
continue;
+ case JFS_DECLARE_BLOCK:
+ if (!raid_sync_data) {
+ brelse(bh);
+ continue;
+ }
+
+ /* this is a declare block for an uncommitted
+ * transaction, so raid sync all of the blocks it
+ * describes
+ */
+
+ tagp = &bh->b_data[sizeof(journal_header_t)];
+		while ((tagp - bh->b_data + sizeof(journal_block_tag_t))
+		       <= journal->j_blocksize) {
+
+ unsigned long blocknr;
+
+ tag = (journal_block_tag_t *) tagp;
+ flags = be32_to_cpu(tag->t_flags);
+ blocknr = be32_to_cpu(tag->t_blocknr);
+
+ if (resync_range(journal, blocknr, blocknr))
+ ++info->resync_errors;
+ ++info->nr_declared;
+
+ tagp += sizeof(journal_block_tag_t);
+ if (!(flags & JFS_FLAG_SAME_UUID))
+ tagp += 16;
+
+ if (flags & JFS_FLAG_LAST_TAG)
+ break;
+ }
+
+ brelse(bh);
+ continue;
+
default:
jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
@@ -705,6 +845,38 @@ static int do_one_pass(journal_t *journa
return err;
}
+/* RAID sync the next one quarter of the journal. This is called once at the
+ * end of recovery if declare blocks are present since that part of the journal
+ * was likely undergoing writes before the crash.
+ */
+static int
+journal_syncraid(journal_t *journal, unsigned long next_log_block)
+{
+ int i, err;
+ unsigned long blocknr;
+
+ jbd_debug(2, "RAID resync of 1/4 of the journal starting at %lu\n",
+ next_log_block);
+
+ for (i = 0; i < journal->j_maxlen / 4; i++) {
+ err = journal_bmap(journal, next_log_block, &blocknr);
+
+ if (err) {
+ printk(KERN_ERR "JBD: bad block at offset %lu\n",
+ next_log_block);
+ return err;
+ }
+
+ err = resync_range(journal, blocknr, blocknr);
+ if (err)
+ return err;
+
+ next_log_block++;
+ wrap(journal, next_log_block);
+ }
+
+ return 0;
+}
+
/* Scan a revoke record, marking all blocks mentioned as revoked. */
Index: linux-2.6.18-128.1.6/fs/jbd/transaction.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/transaction.c
+++ linux-2.6.18-128.1.6/fs/jbd/transaction.c
@@ -58,6 +58,10 @@ get_transaction(journal_t *journal, tran
journal->j_commit_timer.expires = transaction->t_expires;
add_timer(&journal->j_commit_timer);
+ /* Initialize the declare radix tree */
+ INIT_RADIX_TREE(&transaction->t_declare_root, GFP_ATOMIC);
+ INIT_RADIX_TREE(&transaction->t_declare_done_root, GFP_ATOMIC);
+
J_ASSERT(journal->j_running_transaction == NULL);
journal->j_running_transaction = transaction;
transaction->t_max_wait = 0;
@@ -956,6 +960,7 @@ int journal_dirty_data(handle_t *handle,
journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0;
struct journal_head *jh;
+ int jdatalist;
if (is_handle_aborted(handle))
return 0;
@@ -999,6 +1004,8 @@ int journal_dirty_data(handle_t *handle,
goto no_journal;
}
+ jdatalist = journal->j_flags & JFS_DECLARE ? BJ_Declare : BJ_SyncData;
+
if (jh->b_transaction) {
JBUFFER_TRACE(jh, "has transaction");
if (jh->b_transaction != handle->h_transaction) {
@@ -1041,6 +1048,8 @@ int journal_dirty_data(handle_t *handle,
*/
if (jh->b_jlist != BJ_None &&
jh->b_jlist != BJ_SyncData &&
+ jh->b_jlist != BJ_Declare &&
+ jh->b_jlist != BJ_DeclareDone &&
jh->b_jlist != BJ_Locked) {
JBUFFER_TRACE(jh, "Not stealing");
goto no_journal;
@@ -1088,18 +1097,19 @@ int journal_dirty_data(handle_t *handle,
* committing transaction, so might still be left on that
* transaction's metadata lists.
*/
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
+ if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Declare &&
+ jh->b_jlist != BJ_DeclareDone && jh->b_jlist != BJ_Locked) {
JBUFFER_TRACE(jh, "not on correct data list: unfile");
J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
__journal_temp_unlink_buffer(jh);
jh->b_transaction = handle->h_transaction;
JBUFFER_TRACE(jh, "file as data");
__journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
+ jdatalist);
}
} else {
JBUFFER_TRACE(jh, "not on a transaction");
- __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
+ __journal_file_buffer(jh, handle->h_transaction, jdatalist);
}
no_journal:
spin_unlock(&journal->j_list_lock);
@@ -1578,6 +1588,7 @@ void __journal_temp_unlink_buffer(struct
struct journal_head **list = NULL;
transaction_t *transaction;
struct buffer_head *bh = jh2bh(jh);
+ struct radix_tree_root *root = NULL;
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
transaction = jh->b_transaction;
@@ -1617,9 +1628,25 @@ void __journal_temp_unlink_buffer(struct
case BJ_Locked:
list = &transaction->t_locked_list;
break;
+ case BJ_Declare:
+ root = &transaction->t_declare_root;
+ transaction->t_declare_count--;
+ break;
+ case BJ_DeclareDone:
+ root = &transaction->t_declare_done_root;
+ break;
+ }
+
+ if (jh->b_jlist == BJ_Declare || jh->b_jlist == BJ_DeclareDone) {
+ if ((radix_tree_delete(root, bh->b_blocknr)) != jh) {
+ printk(KERN_ERR
+ "jbd: ERROR radix tree delete block %8llu\n",
+ (unsigned long long)bh->b_blocknr);
+ }
}
+ else
+ __blist_del_buffer(list, jh);
- __blist_del_buffer(list, jh);
jh->b_jlist = BJ_None;
if (test_clear_buffer_jbddirty(bh))
mark_buffer_dirty(bh); /* Expose it to the VM */
@@ -1660,7 +1687,8 @@ __journal_try_to_free_buffer(journal_t *
spin_lock(&journal->j_list_lock);
if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
+ if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Declare ||
+ jh->b_jlist == BJ_DeclareDone || jh->b_jlist == BJ_Locked) {
/* A written-back ordered data buffer */
JBUFFER_TRACE(jh, "release data");
__journal_unfile_buffer(jh);
@@ -2072,6 +2100,8 @@ void __journal_file_buffer(struct journa
struct journal_head **list = NULL;
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
+ struct radix_tree_root *root = NULL;
+ int declare_per_block;
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -2126,15 +2156,44 @@ void __journal_file_buffer(struct journa
list = &transaction->t_reserved_list;
break;
case BJ_Locked:
- list = &transaction->t_locked_list;
+ list = &transaction->t_locked_list;
+ break;
+ case BJ_Declare:
+ root = &transaction->t_declare_root;
+ transaction->t_declare_count++;
break;
+ case BJ_DeclareDone:
+ root = &transaction->t_declare_done_root;
+ break;
+ }
+
+ if (jlist == BJ_Declare || jlist == BJ_DeclareDone) {
+ if ((radix_tree_insert(root, bh->b_blocknr, jh)) != 0) {
+ printk(KERN_ERR
+		"jbd: ERROR radix tree insert block %8llu\n",
+		(unsigned long long)bh->b_blocknr);
+ }
+ } else {
+ __blist_add_buffer(list, jh);
}
- __blist_add_buffer(list, jh);
jh->b_jlist = jlist;
if (was_dirty)
set_buffer_jbddirty(bh);
+
+ declare_per_block = (bh->b_size - (sizeof(journal_header_t) + 32)) /
+ sizeof(journal_block_tag_t);
+
+ /* wake up the commit thread to perform early declarations */
+ assert_spin_locked(&transaction->t_journal->j_list_lock);
+ if (transaction->t_journal->j_flags & JFS_DECLARE &&
+ jlist == BJ_Declare &&
+ transaction->t_declare_count >= declare_per_block) {
+ transaction->t_declare_request = transaction->t_declare_count /
+ declare_per_block * declare_per_block;
+ wake_up(&transaction->t_journal->j_wait_commit);
+ }
}
void journal_file_buffer(struct journal_head *jh,
Index: linux-2.6.18-128.1.6/include/linux/jbd.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/jbd.h
+++ linux-2.6.18-128.1.6/include/linux/jbd.h
@@ -26,6 +26,7 @@
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/journal-head.h>
+#include <linux/radix-tree.h>
#include <linux/stddef.h>
#include <linux/bit_spinlock.h>
#include <linux/mutex.h>
@@ -137,6 +138,7 @@ typedef struct journal_s journal_t; /* J
#define JFS_SUPERBLOCK_V1 3
#define JFS_SUPERBLOCK_V2 4
#define JFS_REVOKE_BLOCK 5
+#define JFS_DECLARE_BLOCK 6
/*
* Standard header for all descriptor blocks:
@@ -261,12 +263,14 @@ typedef struct journal_superblock_s
#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
+#define JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS 0x00000008
/* Features known to this kernel version: */
#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM
#define JFS_KNOWN_ROCOMPAT_FEATURES 0
#define JFS_KNOWN_INCOMPAT_FEATURES (JFS_FEATURE_INCOMPAT_REVOKE | \
- JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT | \
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)
#ifdef __KERNEL__
@@ -559,6 +563,15 @@ struct transaction_s
struct journal_head *t_sync_datalist;
/*
+ * Radix tree of all data buffers that must be declared before being
+ * written, declare mode counters [j_list_lock]
+ */
+ struct radix_tree_root t_declare_root;
+ struct radix_tree_root t_declare_done_root;
+ unsigned int t_declare_count;
+ unsigned int t_declare_request;
+
+ /*
* Doubly-linked circular list of all forget buffers (superseded
* buffers which we can un-checkpoint once this transaction commits)
* [j_list_lock]
@@ -730,6 +743,7 @@ jbd_time_diff(unsigned int start, unsign
* @j_wait_checkpoint: Wait queue to trigger checkpointing
* @j_wait_commit: Wait queue to trigger commit
* @j_wait_updates: Wait queue to wait for updates to complete
+ * @j_wait_declare: Wait queue to wait for declarations to complete
* @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
* @j_head: Journal head - identifies the first unused block in the journal
* @j_tail: Journal tail - identifies the oldest still-used block in the
@@ -768,6 +782,8 @@ jbd_time_diff(unsigned int start, unsign
* @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
* number that will fit in j_blocksize
* @j_last_sync_writer: most recent pid which did a synchronous write
+ * @j_declare_jhs: array of journal_heads for write_declare_blocks
+ * @j_declare_bhs: array of buffer_heads for write_declare_blocks
* @j_private: An opaque pointer to fs-private information.
*/
@@ -841,6 +857,9 @@ struct journal_s
/* Wait queue to wait for updates to complete */
wait_queue_head_t j_wait_updates;
+ /* Wait queue to wait for declarations to complete */
+ wait_queue_head_t j_wait_declare;
+
/* Semaphore for locking against concurrent checkpoints */
struct mutex j_checkpoint_mutex;
@@ -970,6 +989,13 @@ struct journal_s
struct transaction_stats_s j_stats;
/*
+ * Arrays of jhs and bhs for write_declare_blocks, to avoid
+ * having to allocate them each time.
+ */
+ void *j_declare_jhs[64];
+ struct buffer_head *j_declare_bhs[64];
+
+ /*
* An opaque pointer to fs-private information. ext3 puts its
* superblock pointer here
*/
@@ -985,6 +1011,7 @@ struct journal_s
#define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */
#define JFS_LOADED 0x010 /* The journal superblock has been loaded */
#define JFS_BARRIER 0x020 /* Use IDE barriers */
+#define JFS_DECLARE 0x040 /* Declare data blocks before writing */
/*
* Function declarations for the journaling transaction and buffer
@@ -1100,6 +1127,7 @@ extern void journal_ack_err (journ
extern int journal_clear_err (journal_t *);
extern int journal_bmap(journal_t *, unsigned long, unsigned long *);
extern int journal_force_commit(journal_t *);
+extern int journal_write_declare(journal_t *);
/*
* journal_head management
@@ -1244,7 +1272,9 @@ static inline int jbd_space_needed(journ
#define BJ_LogCtl 6 /* Buffer contains log descriptors */
#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
#define BJ_Locked 8 /* Locked for I/O during commit */
-#define BJ_Types 9
+#define BJ_Declare 9 /* Needs to be declared first */
+#define BJ_DeclareDone 10 /* Has been declared */
+#define BJ_Types 11
extern int jbd_blocks_per_page(struct inode *inode);
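
As a usage note, journal_write_declare() is meant to be called from the
filesystem's fsync path (see the "For ext3_fsync" comment in journal.c above);
the actual call site belongs to the ext3 patch in this series. A hypothetical
sketch of such a caller, assuming only the interfaces added by this patch:

	/*
	 * Hypothetical fsync-path caller; the function name is illustrative.
	 * journal_write_declare() wakes kjournald, which runs
	 * write_declare_blocks() with committing == 0, and the caller is
	 * woken on j_wait_declare once the running transaction's declare
	 * tree has been drained, so the data writeout that follows is safe.
	 */
	static int example_declare_for_fsync(journal_t *journal)
	{
		/* Nothing to do unless the journal is in declared mode */
		if (!(journal->j_flags & JFS_DECLARE))
			return 0;

		return journal_write_declare(journal);
	}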