Re: next-20090310: ext4 hangs

From: Alexander Beregalov
Date: Wed Mar 25 2009 - 13:08:46 EST


2009/3/25 Jan Kara <jack@xxxxxxx>:
> On Wed 25-03-09 18:29:10, Alexander Beregalov wrote:
>> 2009/3/25 Jan Kara <jack@xxxxxxx>:
>> > On Wed 25-03-09 18:18:43, Alexander Beregalov wrote:
>> >> 2009/3/25 Jan Kara <jack@xxxxxxx>:
>> >> >> > So, I think I need to try it on 2.6.29-rc7 again.
>> >> >>   I've looked into this. Obviously, what's happening is that we delete
>> >> >> an inode and jbd2_journal_release_jbd_inode() finds inode is just under
>> >> >> writeout in transaction commit and thus it waits. But it never gets woken
>> >> >> up and because it has a handle from the transaction, everyone eventually
>> >> >> blocks on waiting for a transaction to finish.
>> >> >>   But I don't really see how that can happen. The code is really
>> >> >> straightforward and everything happens under j_list_lock... Strange.
>> >> >  BTW: Is the system SMP?
>> >> No, it is a UP system.
>> >  Even stranger. And do you have CONFIG_PREEMPT set?
>> >
>> >> The bug exists even in 2.6.29, I posted it with a new topic.
>> >  OK, I've sort-of expected this.
>>
>> CONFIG_PREEMPT_RCU=y
>> CONFIG_PREEMPT_RCU_TRACE=y
>> # CONFIG_PREEMPT_NONE is not set
>> # CONFIG_PREEMPT_VOLUNTARY is not set
>> CONFIG_PREEMPT=y
>> CONFIG_DEBUG_PREEMPT=y
>> # CONFIG_PREEMPT_TRACER is not set
>>
>> config is attached.
>  Thanks for the data. I still don't see how the wakeup can get lost. The
> process even cannot be preempted when we are in the section protected by
> j_list_lock... Can you send me a disassembly of functions
> jbd2_journal_release_jbd_inode() and journal_submit_data_buffers() so that
> I can see whether the compiler has not reordered something unexpectedly?

void jbd2_journal_release_jbd_inode(journal_t *journal,
struct jbd2_inode *jinode)
{
6d8: 9d e3 bf 00 save %sp, -256, %sp
6dc: 11 00 00 00 sethi %hi(0), %o0
6e0: 40 00 00 00 call 6e0 <jbd2_journal_release_jbd_inode+0x8>
6e4: 90 12 20 00 mov %o0, %o0 ! 0 <jbd2_history_skip_empty>
int writeout = 0;

if (!journal)
6e8: 02 c6 00 30 brz,pn %i0, 7a8
<jbd2_journal_release_jbd_inode+0xd0>
6ec: 03 00 00 00 sethi %hi(0), %g1
return;
restart:
spin_lock(&journal->j_list_lock);
6f0: b0 06 25 70 add %i0, 0x570, %i0
/* Is commit writing out inode - we have to wait */
if (jinode->i_flags & JI_COMMIT_RUNNING) {
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
6f4: aa 10 60 00 mov %g1, %l5
6f8: a2 06 60 28 add %i1, 0x28, %l1
6fc: a8 07 a7 b7 add %fp, 0x7b7, %l4
700: a6 07 a7 df add %fp, 0x7df, %l3
wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
704: a4 07 a7 c7 add %fp, 0x7c7, %l2
int writeout = 0;

if (!journal)
return;
restart:
spin_lock(&journal->j_list_lock);
708: 40 00 00 00 call 708 <jbd2_journal_release_jbd_inode+0x30>
70c: 90 10 00 18 mov %i0, %o0
/* Is commit writing out inode - we have to wait */
if (jinode->i_flags & JI_COMMIT_RUNNING) {
710: c2 06 60 28 ld [ %i1 + 0x28 ], %g1
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
714: 94 10 20 38 mov 0x38, %o2
718: 90 10 00 14 mov %l4, %o0
71c: 92 10 20 00 clr %o1
if (!journal)
return;
restart:
spin_lock(&journal->j_list_lock);
/* Is commit writing out inode - we have to wait */
if (jinode->i_flags & JI_COMMIT_RUNNING) {
720: 80 88 60 01 btst 1, %g1
724: 02 60 00 19 be,pn %xcc, 788
<jbd2_journal_release_jbd_inode+0xb0>
728: a0 10 00 04 mov %g4, %l0
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
72c: 40 00 00 00 call 72c <jbd2_journal_release_jbd_inode+0x54>
730: 01 00 00 00 nop
wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
734: 90 10 00 11 mov %l1, %o0
738: 92 10 20 00 clr %o1
restart:
spin_lock(&journal->j_list_lock);
/* Is commit writing out inode - we have to wait */
if (jinode->i_flags & JI_COMMIT_RUNNING) {
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
73c: e0 77 a7 cf stx %l0, [ %fp + 0x7cf ]
740: e2 77 a7 b7 stx %l1, [ %fp + 0x7b7 ]
744: ea 77 a7 d7 stx %l5, [ %fp + 0x7d7 ]
748: e6 77 a7 df stx %l3, [ %fp + 0x7df ]
wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
74c: 40 00 00 00 call 74c <jbd2_journal_release_jbd_inode+0x74>
750: e6 77 a7 e7 stx %l3, [ %fp + 0x7e7 ]
prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
754: 92 10 00 12 mov %l2, %o1
758: 94 10 20 02 mov 2, %o2
75c: 40 00 00 00 call 75c <jbd2_journal_release_jbd_inode+0x84>
760: a0 10 00 08 mov %o0, %l0
spin_unlock(&journal->j_list_lock);
764: 40 00 00 00 call 764 <jbd2_journal_release_jbd_inode+0x8c>
768: 90 10 00 18 mov %i0, %o0
schedule();
76c: 40 00 00 00 call 76c <jbd2_journal_release_jbd_inode+0x94>
770: 01 00 00 00 nop
finish_wait(wq, &wait.wait);
774: 90 10 00 10 mov %l0, %o0
778: 40 00 00 00 call 778 <jbd2_journal_release_jbd_inode+0xa0>
77c: 92 10 00 12 mov %l2, %o1
780: 10 6f ff e2 b %xcc, 708
<jbd2_journal_release_jbd_inode+0x30>
784: 01 00 00 00 nop
}

/* Do we need to wait for data writeback? */
if (journal->j_committing_transaction == jinode->i_transaction)
writeout = 1;
if (jinode->i_transaction) {
788: c2 5e 40 00 ldx [ %i1 ], %g1
78c: 02 c0 40 05 brz,pn %g1, 7a0
<jbd2_journal_release_jbd_inode+0xc8>
790: 01 00 00 00 nop
list_del(&jinode->i_list);
794: 40 00 00 00 call 794 <jbd2_journal_release_jbd_inode+0xbc>
798: 90 06 60 10 add %i1, 0x10, %o0
jinode->i_transaction = NULL;
79c: c0 76 40 00 clrx [ %i1 ]
}
spin_unlock(&journal->j_list_lock);
7a0: 40 00 00 00 call 7a0 <jbd2_journal_release_jbd_inode+0xc8>
7a4: 90 10 00 18 mov %i0, %o0
7a8: 81 cf e0 08 rett %i7 + 8
7ac: 01 00 00 00 nop

====
By default gcc inlines journal_submit_data_buffers().
Here is the -fno-inline version. The default version is in the attachment.
====

static int journal_submit_data_buffers(journal_t *journal,
transaction_t *commit_transaction)
{
9c: 9d e3 bf 40 save %sp, -192, %sp
a0: 11 00 00 00 sethi %hi(0), %o0
struct jbd2_inode *jinode;
int err, ret = 0;
struct address_space *mapping;

spin_lock(&journal->j_list_lock);
a4: a4 06 25 70 add %i0, 0x570, %l2
* our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
* operate on from being released while we write out pages.
*/
static int journal_submit_data_buffers(journal_t *journal,
transaction_t *commit_transaction)
{
a8: 90 12 20 00 mov %o0, %o0
ac: 40 00 00 00 call ac <journal_submit_data_buffers+0x10>
b0: b0 10 20 00 clr %i0
struct jbd2_inode *jinode;
int err, ret = 0;
struct address_space *mapping;

spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
b4: a6 06 60 60 add %i1, 0x60, %l3
{
struct jbd2_inode *jinode;
int err, ret = 0;
struct address_space *mapping;

spin_lock(&journal->j_list_lock);
b8: 40 00 00 00 call b8 <journal_submit_data_buffers+0x1c>
bc: 90 10 00 12 mov %l2, %o0
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
c0: 10 68 00 1d b %xcc, 134 <journal_submit_data_buffers+0x98>
c4: c2 5e 60 60 ldx [ %i1 + 0x60 ], %g1
mapping = jinode->i_vfs_inode->i_mapping;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
c8: 90 10 00 12 mov %l2, %o0
struct address_space *mapping;

spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
mapping = jinode->i_vfs_inode->i_mapping;
jinode->i_flags |= JI_COMMIT_RUNNING;
cc: c2 04 60 28 ld [ %l1 + 0x28 ], %g1
int err, ret = 0;
struct address_space *mapping;

spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
mapping = jinode->i_vfs_inode->i_mapping;
d0: e0 58 a1 e0 ldx [ %g2 + 0x1e0 ], %l0
jinode->i_flags |= JI_COMMIT_RUNNING;
d4: 82 10 60 01 or %g1, 1, %g1
spin_unlock(&journal->j_list_lock);
d8: 40 00 00 00 call d8 <journal_submit_data_buffers+0x3c>
dc: c2 24 60 28 st %g1, [ %l1 + 0x28 ]
* submit the inode data buffers. We use writepage
* instead of writepages. Because writepages can do
* block allocation with delalloc. We need to write
* only allocated blocks here.
*/
err = journal_submit_inode_data_buffers(mapping);
e0: 7f ff ff d3 call 2c <journal_submit_inode_data_buffers>
e4: 90 10 00 10 mov %l0, %o0
if (!ret)
e8: 80 a6 20 00 cmp %i0, 0
ec: b1 64 40 08 move %icc, %o0, %i0
ret = err;
spin_lock(&journal->j_list_lock);
f0: 40 00 00 00 call f0 <journal_submit_data_buffers+0x54>
f4: 90 10 00 12 mov %l2, %o0
J_ASSERT(jinode->i_transaction == commit_transaction);
f8: c2 5c 40 00 ldx [ %l1 ], %g1
fc: 80 a0 40 19 cmp %g1, %i1
100: 22 68 00 07 be,a %xcc, 11c
<journal_submit_data_buffers+0x80>
104: c2 04 60 28 ld [ %l1 + 0x28 ], %g1
108: 11 00 00 00 sethi %hi(0), %o0
10c: 92 10 21 04 mov 0x104, %o1
110: 40 00 00 00 call 110 <journal_submit_data_buffers+0x74>
114: 90 12 20 00 mov %o0, %o0
118: 91 d0 20 05 ta 5
jinode->i_flags &= ~JI_COMMIT_RUNNING;
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
11c: 90 04 60 28 add %l1, 0x28, %o0
120: 92 10 20 00 clr %o1
err = journal_submit_inode_data_buffers(mapping);
if (!ret)
ret = err;
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
124: 82 08 7f fe and %g1, -2, %g1
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
128: 40 00 00 00 call 128 <journal_submit_data_buffers+0x8c>
12c: c2 24 60 28 st %g1, [ %l1 + 0x28 ]
struct jbd2_inode *jinode;
int err, ret = 0;
struct address_space *mapping;

spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
130: c2 5c 60 10 ldx [ %l1 + 0x10 ], %g1
134: a2 00 7f f0 add %g1, -16, %l1
* prefetches into the prefetch-cache which only is accessible
* by floating point operations in UltraSPARC-III and later.
* By contrast, "#one_write" prefetches into the L2 cache
* in shared state.
*/
__asm__ __volatile__("prefetch [%0], #one_write"
138: c2 5c 60 10 ldx [ %l1 + 0x10 ], %g1
13c: c7 68 40 00 prefetch [ %g1 ], #one_write
140: 82 04 60 10 add %l1, 0x10, %g1
144: 80 a4 c0 01 cmp %l3, %g1
148: 32 6f ff e0 bne,a %xcc, c8
<journal_submit_data_buffers+0x2c>
14c: c4 5c 60 20 ldx [ %l1 + 0x20 ], %g2
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
spin_unlock(&journal->j_list_lock);
150: 90 10 00 12 mov %l2, %o0
154: 40 00 00 00 call 154 <journal_submit_data_buffers+0xb8>
158: b1 3e 20 00 sra %i0, 0, %i0
return ret;
}
15c: 81 cf e0 08 rett %i7 + 8
160: 01 00 00 00 nop

Attachment: jbd2-commit-noinline.out
Description: Binary data