[patch 16/67] jbd: fix commit of ordered data buffers

From: Greg KH
Date: Wed Oct 11 2006 - 17:33:08 EST



-stable review patch. If anyone has any objections, please let us know.

------------------
From: Jan Kara <jack@xxxxxxx>

Original commit code assumes, that when a buffer on BJ_SyncData list is
locked, it is being written to disk. But this is not true and hence it can
lead to a potential data loss on crash. Also the code didn't count with
the fact that journal_dirty_data() can steal buffers from committing
transaction and hence could write buffers that no longer belong to the
committing transaction. Finally it could possibly happen that we tried
writing out one buffer several times.

The patch below tries to solve these problems by a complete rewrite of the
data commit code. We go through buffers on t_sync_datalist, lock buffers
needing write out and store them in an array. Buffers are also immediately
refiled to BJ_Locked list or unfiled (if the write out is completed). When
the array is full or we have to block on buffer lock, we submit all
accumulated buffers for IO.

[suitable for 2.6.18.x around the 2.6.19-rc2 timeframe]

Signed-off-by: Jan Kara <jack@xxxxxxx>
Cc: Badari Pulavarty <pbadari@xxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxx>

---
fs/jbd/commit.c | 182 ++++++++++++++++++++++++++++++++++----------------------
1 file changed, 113 insertions(+), 69 deletions(-)

--- linux-2.6.18.orig/fs/jbd/commit.c
+++ linux-2.6.18/fs/jbd/commit.c
@@ -160,6 +160,117 @@ static int journal_write_commit_record(j
return (ret == -EIO);
}

+void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+{
+ int i;
+
+ for (i = 0; i < bufs; i++) {
+ wbuf[i]->b_end_io = end_buffer_write_sync;
+ /* We use-up our safety reference in submit_bh() */
+ submit_bh(WRITE, wbuf[i]);
+ }
+}
+
+/*
+ * Submit all the data buffers to disk
+ */
+static void journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
+{
+ struct journal_head *jh;
+ struct buffer_head *bh;
+ int locked;
+ int bufs = 0;
+ struct buffer_head **wbuf = journal->j_wbuf;
+
+ /*
+ * Whenever we unlock the journal and sleep, things can get added
+ * onto ->t_sync_datalist, so we have to keep looping back to
+ * write_out_data until we *know* that the list is empty.
+ *
+ * Cleanup any flushed data buffers from the data list. Even in
+ * abort mode, we want to flush this out as soon as possible.
+ */
+write_out_data:
+ cond_resched();
+ spin_lock(&journal->j_list_lock);
+
+ while (commit_transaction->t_sync_datalist) {
+ jh = commit_transaction->t_sync_datalist;
+ bh = jh2bh(jh);
+ locked = 0;
+
+ /* Get reference just to make sure buffer does not disappear
+ * when we are forced to drop various locks */
+ get_bh(bh);
+ /* If the buffer is dirty, we need to submit IO and hence
+ * we need the buffer lock. We try to lock the buffer without
+ * blocking. If we fail, we need to drop j_list_lock and do
+ * blocking lock_buffer().
+ */
+ if (buffer_dirty(bh)) {
+ if (test_set_buffer_locked(bh)) {
+ BUFFER_TRACE(bh, "needs blocking lock");
+ spin_unlock(&journal->j_list_lock);
+ /* Write out all data to prevent deadlocks */
+ journal_do_submit_data(wbuf, bufs);
+ bufs = 0;
+ lock_buffer(bh);
+ spin_lock(&journal->j_list_lock);
+ }
+ locked = 1;
+ }
+ /* We have to get bh_state lock. Again out of order, sigh. */
+ if (!inverted_lock(journal, bh)) {
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+ }
+ /* Someone already cleaned up the buffer? */
+ if (!buffer_jbd(bh)
+ || jh->b_transaction != commit_transaction
+ || jh->b_jlist != BJ_SyncData) {
+ jbd_unlock_bh_state(bh);
+ if (locked)
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "already cleaned up");
+ put_bh(bh);
+ continue;
+ }
+ if (locked && test_clear_buffer_dirty(bh)) {
+ BUFFER_TRACE(bh, "needs writeout, adding to array");
+ wbuf[bufs++] = bh;
+ __journal_file_buffer(jh, commit_transaction,
+ BJ_Locked);
+ jbd_unlock_bh_state(bh);
+ if (bufs == journal->j_wbufsize) {
+ spin_unlock(&journal->j_list_lock);
+ journal_do_submit_data(wbuf, bufs);
+ bufs = 0;
+ goto write_out_data;
+ }
+ }
+ else {
+ BUFFER_TRACE(bh, "writeout complete: unfile");
+ __journal_unfile_buffer(jh);
+ jbd_unlock_bh_state(bh);
+ if (locked)
+ unlock_buffer(bh);
+ journal_remove_journal_head(bh);
+ /* Once for our safety reference, once for
+ * journal_remove_journal_head() */
+ put_bh(bh);
+ put_bh(bh);
+ }
+
+ if (lock_need_resched(&journal->j_list_lock)) {
+ spin_unlock(&journal->j_list_lock);
+ goto write_out_data;
+ }
+ }
+ spin_unlock(&journal->j_list_lock);
+ journal_do_submit_data(wbuf, bufs);
+}
+
/*
* journal_commit_transaction
*
@@ -313,80 +424,13 @@ void journal_commit_transaction(journal_
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
-
err = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- */
- bufs = 0;
- /*
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
- spin_lock(&journal->j_list_lock);
-
- while (commit_transaction->t_sync_datalist) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_sync_datalist;
- commit_transaction->t_sync_datalist = jh->b_tnext;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- BUFFER_TRACE(bh, "locked");
- if (!inverted_lock(journal, bh))
- goto write_out_data;
- __journal_temp_unlink_buffer(jh);
- __journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (lock_need_resched(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
- }
- } else {
- if (buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "start journal writeout");
- get_bh(bh);
- wbuf[bufs++] = bh;
- if (bufs == journal->j_wbufsize) {
- jbd_debug(2, "submit %d writes\n",
- bufs);
- spin_unlock(&journal->j_list_lock);
- ll_rw_block(SWRITE, bufs, wbuf);
- journal_brelse_array(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- if (!inverted_lock(journal, bh))
- goto write_out_data;
- __journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
- put_bh(bh);
- if (lock_need_resched(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
- }
- }
- }
- }
-
- if (bufs) {
- spin_unlock(&journal->j_list_lock);
- ll_rw_block(SWRITE, bufs, wbuf);
- journal_brelse_array(wbuf, bufs);
- spin_lock(&journal->j_list_lock);
- }
+ journal_submit_data_buffers(journal, commit_transaction);

/*
* Wait for all previously submitted IO to complete.
*/
+ spin_lock(&journal->j_list_lock);
while (commit_transaction->t_locked_list) {
struct buffer_head *bh;


--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/