Re: [PATCH 4/4] jbd: fix error handling for checkpoint io (rebased)

From: Josef Bacik
Date: Wed May 14 2008 - 09:33:04 EST


On Wed, May 14, 2008 at 01:50:43PM +0900, Hidehiro Kawai wrote:
> Subject: [PATCH 4/4] jbd: fix error handling for checkpoint io
>
> When a checkpointing IO fails, current JBD code doesn't check the
> error and continue journaling. This means latest metadata can be
> lost from both the journal and filesystem.
>
> This patch leaves the failed metadata blocks in the journal space
> and aborts journaling in the case of log_do_checkpoint().
> To achieve this, we need to do:
>
> 1. don't remove the failed buffer from the checkpoint list where in
> the case of __try_to_free_cp_buf() because it may be released or
> overwritten by a later transaction
> 2. log_do_checkpoint() is the last chance, remove the failed buffer
> from the checkpoint list and abort journaling
> 3. when checkpointing fails, don't update the journal super block to
> prevent the journalled contents from being cleaned
> 4. when checkpointing fails, don't clear the needs_recovery flag,
> otherwise the journalled contents are ignored and cleaned in the
> recovery phase
> 5. if the recovery fails, keep the needs_recovery flag
>
> Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@xxxxxxxxxxx>
> ---
> fs/ext3/ioctl.c | 12 ++++----
> fs/ext3/super.c | 13 +++++++--
> fs/jbd/checkpoint.c | 59 ++++++++++++++++++++++++++++++++++--------
> fs/jbd/journal.c | 21 +++++++++-----
> fs/jbd/recovery.c | 7 +++-
> include/linux/jbd.h | 7 ++++
> 6 files changed, 91 insertions(+), 28 deletions(-)
>
> Index: linux-2.6.26-rc2/fs/jbd/checkpoint.c
> ===================================================================
> --- linux-2.6.26-rc2.orig/fs/jbd/checkpoint.c
> +++ linux-2.6.26-rc2/fs/jbd/checkpoint.c
> @@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct j
> int ret = 0;
> struct buffer_head *bh = jh2bh(jh);
>
> - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
> + if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
> + !buffer_dirty(bh) && buffer_uptodate(bh)) {
> JBUFFER_TRACE(jh, "remove from checkpoint list");
> ret = __journal_remove_checkpoint(jh) + 1;
> jbd_unlock_bh_state(bh);
> @@ -140,6 +141,25 @@ void __log_wait_for_space(journal_t *jou
> }
>
> /*
> + * We couldn't write back some metadata buffers to the filesystem.
> + * To avoid unwritten-back metadata buffers from losing, set
> + * JFS_CP_ABORT flag and make sure that the journal space isn't
> + * cleaned. This function also aborts journaling.
> + */
> +static void __journal_abort_checkpoint(journal_t *journal, int errno)
> +{
> + if (is_checkpoint_aborted(journal))
> + return;
> +
> + spin_lock(&journal->j_state_lock);
> + journal->j_flags |= JFS_CP_ABORT;
> + spin_unlock(&journal->j_state_lock);
> + printk(KERN_WARNING "JBD: Checkpointing failed. Some metadata blocks "
> + "are still old.\n");
> + journal_abort(journal, errno);
> +}
> +
> +/*
> * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
> * The caller must restart a list walk. Wait for someone else to run
> * jbd_unlock_bh_state().
> @@ -160,21 +180,25 @@ static void jbd_sync_bh(journal_t *journ
> * buffers. Note that we take the buffers in the opposite ordering
> * from the one in which they were submitted for IO.
> *
> + * Return 0 on success, and return <0 if some buffers have failed
> + * to be written out.
> + *
> * Called with j_list_lock held.
> */
> -static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
> +static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
> {
> struct journal_head *jh;
> struct buffer_head *bh;
> tid_t this_tid;
> int released = 0;
> + int ret = 0;
>
> this_tid = transaction->t_tid;
> restart:
> /* Did somebody clean up the transaction in the meanwhile? */
> if (journal->j_checkpoint_transactions != transaction ||
> transaction->t_tid != this_tid)
> - return;
> + return ret;
> while (!released && transaction->t_checkpoint_io_list) {
> jh = transaction->t_checkpoint_io_list;
> bh = jh2bh(jh);
> @@ -194,6 +218,9 @@ restart:
> spin_lock(&journal->j_list_lock);
> goto restart;
> }
> + if (unlikely(!buffer_uptodate(bh)))
> + ret = -EIO;
> +
> /*
> * Now in whatever state the buffer currently is, we know that
> * it has been written out and so we can drop it from the list
> @@ -203,6 +230,8 @@ restart:
> journal_remove_journal_head(bh);
> __brelse(bh);
> }
> +
> + return ret;
> }
>
> #define NR_BATCH 64
> @@ -226,7 +255,8 @@ __flush_batch(journal_t *journal, struct
> * Try to flush one buffer from the checkpoint list to disk.
> *
> * Return 1 if something happened which requires us to abort the current
> - * scan of the checkpoint list.
> + * scan of the checkpoint list. Return <0 if the buffer has failed to
> + * be written out.
> *
> * Called with j_list_lock held and drops it if 1 is returned
> * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
> @@ -256,6 +286,9 @@ static int __process_buffer(journal_t *j
> log_wait_commit(journal, tid);
> ret = 1;
> } else if (!buffer_dirty(bh)) {
> + ret = 1;
> + if (unlikely(!buffer_uptodate(bh)))
> + ret = -EIO;
> J_ASSERT_JH(jh, !buffer_jbddirty(bh));
> BUFFER_TRACE(bh, "remove from checkpoint");
> __journal_remove_checkpoint(jh);
> @@ -263,7 +296,6 @@ static int __process_buffer(journal_t *j
> jbd_unlock_bh_state(bh);
> journal_remove_journal_head(bh);
> __brelse(bh);
> - ret = 1;
> } else {
> /*
> * Important: we are about to write the buffer, and
> @@ -318,6 +350,7 @@ int log_do_checkpoint(journal_t *journal
> * OK, we need to start writing disk blocks. Take one transaction
> * and write it.
> */
> + result = 0;
> spin_lock(&journal->j_list_lock);
> if (!journal->j_checkpoint_transactions)
> goto out;
> @@ -334,7 +367,7 @@ restart:
> int batch_count = 0;
> struct buffer_head *bhs[NR_BATCH];
> struct journal_head *jh;
> - int retry = 0;
> + int retry = 0, err;
>
> while (!retry && transaction->t_checkpoint_list) {
> struct buffer_head *bh;
> @@ -347,6 +380,8 @@ restart:
> break;
> }
> retry = __process_buffer(journal, jh, bhs,&batch_count);
> + if (retry < 0)
> + result = retry;
> if (!retry && (need_resched() ||
> spin_needbreak(&journal->j_list_lock))) {
> spin_unlock(&journal->j_list_lock);
> @@ -371,14 +406,18 @@ restart:
> * Now we have cleaned up the first transaction's checkpoint
> * list. Let's clean up the second one
> */
> - __wait_cp_io(journal, transaction);
> + err = __wait_cp_io(journal, transaction);
> + if (!result)
> + result = err;
> }
> out:
> spin_unlock(&journal->j_list_lock);
> - result = cleanup_journal_tail(journal);
> if (result < 0)
> - return result;
> - return 0;
> + __journal_abort_checkpoint(journal, result);
> + else
> + result = cleanup_journal_tail(journal);
> +
> + return (result < 0) ? result : 0;
> }
>
> /*
> Index: linux-2.6.26-rc2/include/linux/jbd.h
> ===================================================================
> --- linux-2.6.26-rc2.orig/include/linux/jbd.h
> +++ linux-2.6.26-rc2/include/linux/jbd.h
> @@ -816,6 +816,8 @@ struct journal_s
> #define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */
> #define JFS_LOADED 0x010 /* The journal superblock has been loaded */
> #define JFS_BARRIER 0x020 /* Use IDE barriers */
> +#define JFS_CP_ABORT 0x040 /* Checkpointing has failed. We don't have to
> + * clean the journal space. */
>
> /*
> * Function declarations for the journaling transaction and buffer
> @@ -1004,6 +1006,11 @@ static inline int is_journal_aborted(jou
> return journal->j_flags & JFS_ABORT;
> }
>
> +static inline int is_checkpoint_aborted(journal_t *journal)
> +{
> + return journal->j_flags & JFS_CP_ABORT;
> +}
> +
> static inline int is_handle_aborted(handle_t *handle)
> {
> if (handle->h_aborted)
> Index: linux-2.6.26-rc2/fs/ext3/ioctl.c
> ===================================================================
> --- linux-2.6.26-rc2.orig/fs/ext3/ioctl.c
> +++ linux-2.6.26-rc2/fs/ext3/ioctl.c
> @@ -239,7 +239,7 @@ setrsvsz_out:
> case EXT3_IOC_GROUP_EXTEND: {
> ext3_fsblk_t n_blocks_count;
> struct super_block *sb = inode->i_sb;
> - int err;
> + int err, err2;
>
> if (!capable(CAP_SYS_RESOURCE))
> return -EPERM;
> @@ -254,16 +254,16 @@ setrsvsz_out:
> }
> err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
> journal_lock_updates(EXT3_SB(sb)->s_journal);
> - journal_flush(EXT3_SB(sb)->s_journal);
> + err2 = journal_flush(EXT3_SB(sb)->s_journal);
> journal_unlock_updates(EXT3_SB(sb)->s_journal);
> group_extend_out:
> mnt_drop_write(filp->f_path.mnt);
> - return err;
> + return (err) ? err : err2;
> }
> case EXT3_IOC_GROUP_ADD: {
> struct ext3_new_group_data input;
> struct super_block *sb = inode->i_sb;
> - int err;
> + int err, err2;
>
> if (!capable(CAP_SYS_RESOURCE))
> return -EPERM;
> @@ -280,11 +280,11 @@ group_extend_out:
>
> err = ext3_group_add(sb, &input);
> journal_lock_updates(EXT3_SB(sb)->s_journal);
> - journal_flush(EXT3_SB(sb)->s_journal);
> + err2 = journal_flush(EXT3_SB(sb)->s_journal);
> journal_unlock_updates(EXT3_SB(sb)->s_journal);
> group_add_out:
> mnt_drop_write(filp->f_path.mnt);
> - return err;
> + return (err) ? err : err2;
> }
>
>
> Index: linux-2.6.26-rc2/fs/ext3/super.c
> ===================================================================
> --- linux-2.6.26-rc2.orig/fs/ext3/super.c
> +++ linux-2.6.26-rc2/fs/ext3/super.c
> @@ -395,7 +395,10 @@ static void ext3_put_super (struct super
> ext3_xattr_put_super(sb);
> journal_destroy(sbi->s_journal);
> if (!(sb->s_flags & MS_RDONLY)) {
> - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
> + if (!is_checkpoint_aborted(sbi->s_journal)) {
> + EXT3_CLEAR_INCOMPAT_FEATURE(sb,
> + EXT3_FEATURE_INCOMPAT_RECOVER);
> + }
> es->s_state = cpu_to_le16(sbi->s_mount_state);
> BUFFER_TRACE(sbi->s_sbh, "marking dirty");
> mark_buffer_dirty(sbi->s_sbh);

Is this bit here really needed? If we abort the journal the fs will be mounted
read only and we should never get in here. Is there a case where we could abort
the journal and not be flipped to being read-only? If there is such a case I
would think that we should fix that by making the fs read-only, and not have
this check.

> @@ -2373,7 +2376,13 @@ static void ext3_write_super_lockfs(stru
>
> /* Now we set up the journal barrier. */
> journal_lock_updates(journal);
> - journal_flush(journal);
> +
> + /*
> + * We don't want to clear needs_recovery flag when we failed
> + * to flush the journal.
> + */
> + if (journal_flush(journal) < 0)
> + return;
>
> /* Journal blocked and flushed, clear needs_recovery flag. */
> EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
> Index: linux-2.6.26-rc2/fs/jbd/journal.c
> ===================================================================
> --- linux-2.6.26-rc2.orig/fs/jbd/journal.c
> +++ linux-2.6.26-rc2/fs/jbd/journal.c
> @@ -674,7 +674,7 @@ static journal_t * journal_init_common (
> journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
>
> /* The journal is marked for error until we succeed with recovery! */
> - journal->j_flags = JFS_ABORT;
> + journal->j_flags = JFS_ABORT | JFS_CP_ABORT;
>
> /* Set up a default-sized revoke table for the new mount. */
> err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
> @@ -1107,7 +1107,7 @@ int journal_load(journal_t *journal)
> if (journal_reset(journal))
> goto recovery_error;
>
> - journal->j_flags &= ~JFS_ABORT;
> + journal->j_flags &= ~(JFS_ABORT | JFS_CP_ABORT);
> journal->j_flags |= JFS_LOADED;
> return 0;
>
> @@ -1151,7 +1151,8 @@ void journal_destroy(journal_t *journal)
> journal->j_tail = 0;
> journal->j_tail_sequence = ++journal->j_transaction_sequence;
> if (journal->j_sb_buffer) {
> - journal_update_superblock(journal, 1);
> + if (!is_checkpoint_aborted(journal))
> + journal_update_superblock(journal, 1);
> brelse(journal->j_sb_buffer);
> }
>
> @@ -1333,7 +1334,6 @@ static int journal_convert_superblock_v1
>
> int journal_flush(journal_t *journal)
> {
> - int err = 0;
> transaction_t *transaction = NULL;
> unsigned long old_tail;
>
> @@ -1356,14 +1356,19 @@ int journal_flush(journal_t *journal)
> spin_unlock(&journal->j_state_lock);
> }
>
> - /* ...and flush everything in the log out to disk. */
> + /* ...and flush everything in the log out to disk.
> + * Even if an error occurs, we don't stop this loop.
> + * We do checkpoint as much as possible. */
> spin_lock(&journal->j_list_lock);
> - while (!err && journal->j_checkpoint_transactions != NULL) {
> + while (journal->j_checkpoint_transactions != NULL) {
> spin_unlock(&journal->j_list_lock);
> - err = log_do_checkpoint(journal);
> + log_do_checkpoint(journal);
> spin_lock(&journal->j_list_lock);
> }
> spin_unlock(&journal->j_list_lock);
> + if (is_checkpoint_aborted(journal))
> + return -EIO;
> +
> cleanup_journal_tail(journal);
>
> /* Finally, mark the journal as really needing no recovery.
> @@ -1385,7 +1390,7 @@ int journal_flush(journal_t *journal)
> J_ASSERT(journal->j_head == journal->j_tail);
> J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
> spin_unlock(&journal->j_state_lock);
> - return err;
> + return 0;
> }
>
> /**
> Index: linux-2.6.26-rc2/fs/jbd/recovery.c
> ===================================================================
> --- linux-2.6.26-rc2.orig/fs/jbd/recovery.c
> +++ linux-2.6.26-rc2/fs/jbd/recovery.c
> @@ -223,7 +223,7 @@ do { \
> */
> int journal_recover(journal_t *journal)
> {
> - int err;
> + int err, err2;
> journal_superblock_t * sb;
>
> struct recovery_info info;
> @@ -261,7 +261,10 @@ int journal_recover(journal_t *journal)
> journal->j_transaction_sequence = ++info.end_transaction;
>
> journal_clear_revoke(journal);
> - sync_blockdev(journal->j_fs_dev);
> + err2 = sync_blockdev(journal->j_fs_dev);
> + if (!err)
> + err = err2;
> +
> return err;
> }
>
>
>

Other than that one question it looks good to me. Thanks much,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/