Possible fix for lockup in drop_caches
From: richard
Date: Mon Dec 17 2007 - 07:13:39 EST
Andrew Morton wrote:
> On Mon, 3 Dec 2007 16:52:47 +0300
> "Denis V. Lunev" <den@xxxxxxxxxx> wrote:
>
>> There is a AB-BA deadlock regarding drop_caches sysctl. Here are the
code
>> paths:
snip...
> One way to fix jbd (and jbd2) would be:
>
> static void __journal_temp_unlink_buffer(struct journal_head *jh,
> struct buffer_head **bh_to_dirty)
> {
> *bh_to_dirty = NULL;
> ...
> if (test_clear_buffer_jbddirty(bh))
> *bh_to_dirty = bh;
> }
>
> {
> struct buffer_head *bh_to_dirty; /* probably needs
uninitialized_var() */
>
> ...
> __journal_temp_unlink_buffer(jh, &bh_to_dirty);
> ...
> jbd_mark_buffer_dirty(bh_to_dirty);
> brelse(bh_to_dirty);
> ...
> }
>
> static inline void jbd_mark_buffer_dirty(struct buffer_head *bh)
> {
> if (bh)
> mark_buffer_dirty(bh);
> }
>
>
> --
Hi Andrew,
here's an implementation of your suggested approach for jbd.
Without this patch my test-case will lockup in 3 to 5 iterations, with
the patch I have completed 96+ runs.
I don't see any significant difference in write performance with this
patch applied. Subjectively I feel that my system is more responsive
when under heavy write loads but I don't have data to back that up, so
it might just be wishful thinking on my part!
I've tested this on AMD 64x2 SMP 2.6.24-rc*
the patch is against 2.6.24-rc5.
Cheers
Richard
fs/jbd/commit.c | 19 +++++++++++---
fs/jbd/transaction.c | 66 +++++++++++++++++++++++++++++++++++----------------
include/linux/jbd.h | 11 ++++++--
3 files changed, 70 insertions(+), 26 deletions(-)
----
commit 5b3824aa42a07682777836814fc7d1882416c6d3
Author: Richard Kennedy <richard@xxxxxxxxxxxxxxx>
Date: Mon Dec 17 12:05:36 2007 +0000
fix lockup in when calling drop_caches
calling /proc/sys/vm/drop_caches can hang due to a AB/BA lock dependency
between j_list_lock and the inode_lock. This patch moves the redirtying of the buffer head out
from under the j_list_lock.
based on a suggestion by Andrew Morton.
Signed-off-by: Richard Kennedy <richard@xxxxxxxxxxxxxxx>
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 610264b..9115b5d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -181,6 +181,7 @@ static void journal_submit_data_buffers(journal_t *journal,
int locked;
int bufs = 0;
struct buffer_head **wbuf = journal->j_wbuf;
+ struct buffer_head *dirty_bh;
/*
* Whenever we unlock the journal and sleep, things can get added
@@ -254,7 +255,8 @@ write_out_data:
put_bh(bh);
} else {
BUFFER_TRACE(bh, "writeout complete: unfile");
- __journal_unfile_buffer(jh);
+ __journal_unfile_buffer(jh, &dirty_bh);
+ jbd_mark_buffer_dirty(dirty_bh);
jbd_unlock_bh_state(bh);
if (locked)
unlock_buffer(bh);
@@ -296,6 +298,7 @@ void journal_commit_transaction(journal_t *journal)
int first_tag = 0;
int tag_flag;
int i;
+ struct buffer_head *dirty_bh;
/*
* First job: lock down the current transaction and wait for
@@ -453,7 +456,9 @@ void journal_commit_transaction(journal_t *journal)
continue;
}
if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __journal_unfile_buffer(jh);
+ struct buffer_head *dirty_bh;
+ __journal_unfile_buffer(jh, &dirty_bh);
+ jbd_mark_buffer_dirty(dirty_bh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
put_bh(bh);
@@ -833,7 +838,12 @@ restart_loop:
JBUFFER_TRACE(jh, "add to new checkpointing trans");
__journal_insert_checkpoint(jh, commit_transaction);
JBUFFER_TRACE(jh, "refile for checkpoint writeback");
- __journal_refile_buffer(jh);
+ __journal_refile_buffer(jh, &dirty_bh);
+ if (dirty_bh) {
+ spin_unlock(&journal->j_list_lock);
+ jbd_mark_buffer_dirty(dirty_bh);
+ spin_lock(&journal->j_list_lock);
+ }
jbd_unlock_bh_state(bh);
} else {
J_ASSERT_BH(bh, !buffer_dirty(bh));
@@ -845,7 +855,8 @@ restart_loop:
* disk and before we process the buffer on BJ_Forget
* list. */
JBUFFER_TRACE(jh, "refile or unfile freed buffer");
- __journal_refile_buffer(jh);
+ /* As !jbddirty dirty_bh will always be null so we can ignore it */
+ __journal_refile_buffer(jh, &dirty_bh);
if (!jh->b_transaction) {
jbd_unlock_bh_state(bh);
/* needs a brelse */
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 08ff6c7..5bf6202 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -26,7 +26,8 @@
#include <linux/mm.h>
#include <linux/highmem.h>
-static void __journal_temp_unlink_buffer(struct journal_head *jh);
+static void __journal_temp_unlink_buffer(struct journal_head *jh,
+ struct buffer_head **bh);
/*
* get_transaction: obtain a new transaction_t object.
@@ -938,6 +939,7 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0;
struct journal_head *jh;
+ struct buffer_head *dirty_bh = NULL;
if (is_handle_aborted(handle))
return 0;
@@ -1054,7 +1056,7 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
/* journal_clean_data_list() may have got there first */
if (jh->b_transaction != NULL) {
JBUFFER_TRACE(jh, "unfile from commit");
- __journal_temp_unlink_buffer(jh);
+ __journal_temp_unlink_buffer(jh, &dirty_bh);
/* It still points to the committing
* transaction; move it to this one so
* that the refile assert checks are
@@ -1073,7 +1075,7 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
JBUFFER_TRACE(jh, "not on correct data list: unfile");
J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __journal_temp_unlink_buffer(jh);
+ __journal_temp_unlink_buffer(jh, &dirty_bh);
jh->b_transaction = handle->h_transaction;
JBUFFER_TRACE(jh, "file as data");
__journal_file_buffer(jh, handle->h_transaction,
@@ -1085,6 +1087,7 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
}
no_journal:
spin_unlock(&journal->j_list_lock);
+ jbd_mark_buffer_dirty(dirty_bh);
jbd_unlock_bh_state(bh);
if (need_brelse) {
BUFFER_TRACE(bh, "brelse");
@@ -1217,6 +1220,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
struct journal_head *jh;
+ struct buffer_head *dirty_bh = NULL;
int drop_reserve = 0;
int err = 0;
@@ -1269,10 +1273,12 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
*/
if (jh->b_cp_transaction) {
- __journal_temp_unlink_buffer(jh);
+ __journal_temp_unlink_buffer(jh, &dirty_bh);
+ jbd_mark_buffer_dirty(dirty_bh);
__journal_file_buffer(jh, transaction, BJ_Forget);
} else {
- __journal_unfile_buffer(jh);
+ __journal_unfile_buffer(jh, &dirty_bh);
+ jbd_mark_buffer_dirty(dirty_bh);
journal_remove_journal_head(bh);
__brelse(bh);
if (!buffer_jbd(bh)) {
@@ -1507,8 +1513,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Generally the caller needs to re-read the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
+ * returns a buffer head that needs to be marked dirty
*/
-static void __journal_temp_unlink_buffer(struct journal_head *jh)
+static void __journal_temp_unlink_buffer(struct journal_head *jh,
+ struct buffer_head **dirty_bh)
{
struct journal_head **list = NULL;
transaction_t *transaction;
@@ -1523,6 +1531,7 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh)
if (jh->b_jlist != BJ_None)
J_ASSERT_JH(jh, transaction != NULL);
+ *dirty_bh = NULL;
switch (jh->b_jlist) {
case BJ_None:
return;
@@ -1557,21 +1566,24 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh)
__blist_del_buffer(list, jh);
jh->b_jlist = BJ_None;
if (test_clear_buffer_jbddirty(bh))
- mark_buffer_dirty(bh); /* Expose it to the VM */
+ *dirty_bh = bh; /* bh needs to be dirtied to expose it to the vm */
}
-void __journal_unfile_buffer(struct journal_head *jh)
+void __journal_unfile_buffer(struct journal_head *jh,
+ struct buffer_head **dirty_bh)
{
- __journal_temp_unlink_buffer(jh);
+ __journal_temp_unlink_buffer(jh, dirty_bh);
jh->b_transaction = NULL;
}
void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
{
+ struct buffer_head *bh;
jbd_lock_bh_state(jh2bh(jh));
spin_lock(&journal->j_list_lock);
- __journal_unfile_buffer(jh);
+ __journal_unfile_buffer(jh, &bh);
spin_unlock(&journal->j_list_lock);
+ jbd_mark_buffer_dirty(bh);
jbd_unlock_bh_state(jh2bh(jh));
}
@@ -1584,6 +1596,8 @@ static void
__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
{
struct journal_head *jh;
+ int need_brelse = 0;
+ struct buffer_head *dirty_bh = NULL;
jh = bh2jh(bh);
@@ -1598,9 +1612,9 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
/* A written-back ordered data buffer */
JBUFFER_TRACE(jh, "release data");
- __journal_unfile_buffer(jh);
+ __journal_unfile_buffer(jh, &dirty_bh);
journal_remove_journal_head(bh);
- __brelse(bh);
+ need_brelse = 1;
}
} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
@@ -1608,10 +1622,13 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
JBUFFER_TRACE(jh, "remove from checkpoint list");
__journal_remove_checkpoint(jh);
journal_remove_journal_head(bh);
- __brelse(bh);
+ need_brelse = 1;
}
}
spin_unlock(&journal->j_list_lock);
+ jbd_mark_buffer_dirty(dirty_bh);
+ if (need_brelse)
+ __brelse(bh);
out:
return;
}
@@ -1702,8 +1719,10 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
int may_free = 1;
struct buffer_head *bh = jh2bh(jh);
+ struct buffer_head *dirty_bh;
- __journal_unfile_buffer(jh);
+ __journal_unfile_buffer(jh, &dirty_bh);
+ jbd_mark_buffer_dirty(dirty_bh);
if (jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "on running+cp transaction");
@@ -1956,6 +1975,7 @@ void __journal_file_buffer(struct journal_head *jh,
struct journal_head **list = NULL;
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
+ struct buffer_head *dirty_bh;
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1978,8 +1998,10 @@ void __journal_file_buffer(struct journal_head *jh,
was_dirty = 1;
}
- if (jh->b_transaction)
- __journal_temp_unlink_buffer(jh);
+ if (jh->b_transaction) {
+ __journal_temp_unlink_buffer(jh, &dirty_bh);
+ jbd_mark_buffer_dirty(dirty_bh);
+ }
jh->b_transaction = transaction;
switch (jlist) {
@@ -2041,7 +2063,8 @@ void journal_file_buffer(struct journal_head *jh,
*
* Called under jbd_lock_bh_state(jh2bh(jh))
*/
-void __journal_refile_buffer(struct journal_head *jh)
+void __journal_refile_buffer(struct journal_head *jh,
+ struct buffer_head **dirty_bh)
{
int was_dirty;
struct buffer_head *bh = jh2bh(jh);
@@ -2052,7 +2075,7 @@ void __journal_refile_buffer(struct journal_head *jh)
/* If the buffer is now unused, just drop it. */
if (jh->b_next_transaction == NULL) {
- __journal_unfile_buffer(jh);
+ __journal_unfile_buffer(jh, dirty_bh);
return;
}
@@ -2062,7 +2085,8 @@ void __journal_refile_buffer(struct journal_head *jh)
*/
was_dirty = test_clear_buffer_jbddirty(bh);
- __journal_temp_unlink_buffer(jh);
+ /* as we just cleared jbddirty we could safely ignore dirty_bh */
+ __journal_temp_unlink_buffer(jh, dirty_bh);
jh->b_transaction = jh->b_next_transaction;
jh->b_next_transaction = NULL;
__journal_file_buffer(jh, jh->b_transaction,
@@ -2090,14 +2114,16 @@ void __journal_refile_buffer(struct journal_head *jh)
void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
struct buffer_head *bh = jh2bh(jh);
+ struct buffer_head *dirty_bh;
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
- __journal_refile_buffer(jh);
+ __journal_refile_buffer(jh, &dirty_bh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
spin_unlock(&journal->j_list_lock);
+ jbd_mark_buffer_dirty(dirty_bh);
__brelse(bh);
}
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index d9ecd13..8c22a33 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -835,14 +835,21 @@ struct journal_s
/* Filing buffers */
extern void journal_unfile_buffer(journal_t *, struct journal_head *);
-extern void __journal_unfile_buffer(struct journal_head *);
-extern void __journal_refile_buffer(struct journal_head *);
+extern void __journal_unfile_buffer(struct journal_head *,
+ struct buffer_head **dirty_bh);
+extern void __journal_refile_buffer(struct journal_head *,
+ struct buffer_head **dirty_bh);
extern void journal_refile_buffer(journal_t *, struct journal_head *);
extern void __journal_file_buffer(struct journal_head *, transaction_t *, int);
extern void __journal_free_buffer(struct journal_head *bh);
extern void journal_file_buffer(struct journal_head *, transaction_t *, int);
extern void __journal_clean_data_list(transaction_t *transaction);
+static inline void jbd_mark_buffer_dirty(struct buffer_head *bh)
+{
+ if (bh)
+ mark_buffer_dirty(bh);
+}
/* Log buffer allocation */
extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
int journal_next_log_block(journal_t *, unsigned long *);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/