Re: [PATCH cgroup/for-4.3-fixes] cgroup, writeback: don't enable cgroup writeback on traditional hierarchies

From: Tejun Heo
Date: Fri Sep 25 2015 - 11:49:35 EST


Hello, Artem.

On Fri, Sep 25, 2015 at 01:50:22PM +0300, Artem Bityutskiy wrote:
> > Does not compile with multiple errors like
> >
> > linux/fs/fs-writeback.c:799:10: error: âstruct bdi_writebackâ has no
> > member named âlast_comp_genâ
> > bdi->wb.last_comp_gen = bdi->wb.comp_gen;
>
> I tried to extend your patch with these fields, but I am not sure I got
> it right, so please, send a new patch, I'll run the reboot corruption
> test with your patch.

Oops, sorry, I'm an idiot.

> Please, note, because this test is about reboots, I'll probably output
> everything to the serial console. Therefore, please, do not print too
> much data. Otherwise I'd have to modify my scripts to collect dmesg
> before restarting, which is more work.

Hmmm... I'm gonna add ratelimit on inode printouts; other than that,
it shouldn't print too much.

Thanks.

---
fs/fs-writeback.c | 154 +++++++++++++++++++++++++++++++++++++--
fs/inode.c | 1
include/linux/backing-dev-defs.h | 2
include/linux/backing-dev.h | 20 ++++-
include/linux/fs.h | 2
mm/backing-dev.c | 2
6 files changed, 173 insertions(+), 8 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -101,7 +101,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepa

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
- if (wb_has_dirty_io(wb)) {
+ if (test_bit(WB_has_dirty_io, &wb->state)) {
return false;
} else {
set_bit(WB_has_dirty_io, &wb->state);
@@ -763,6 +763,15 @@ static long wb_split_bdi_pages(struct bd
return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
}

+extern spinlock_t cgwb_lock;
+
+struct split_work_dbg {
+ DECLARE_BITMAP(all_wbs, 8192);
+ DECLARE_BITMAP(iterated_wbs, 8192);
+ DECLARE_BITMAP(written_wbs, 8192);
+ DECLARE_BITMAP(sync_wbs, 8192);
+};
+
/**
* bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
* @bdi: target backing_dev_info
@@ -776,11 +785,25 @@ static long wb_split_bdi_pages(struct bd
*/
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
- bool skip_if_busy)
+ bool skip_if_busy, struct split_work_dbg *dbg)
{
int next_memcg_id = 0;
struct bdi_writeback *wb;
struct wb_iter iter;
+ struct radix_tree_iter riter;
+ void **slot;
+
+ if (dbg) {
+ spin_lock_irq(&cgwb_lock);
+ set_bit(bdi->wb.memcg_css->id, dbg->all_wbs);
+ bdi->wb.last_comp_gen = bdi->wb.comp_gen;
+ radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &riter, 0) {
+ wb = *slot;
+ set_bit(wb->memcg_css->id, dbg->all_wbs);
+ wb->last_comp_gen = wb->comp_gen;
+ }
+ spin_unlock_irq(&cgwb_lock);
+ }

might_sleep();
restart:
@@ -791,6 +814,9 @@ restart:
struct wb_writeback_work *work;
long nr_pages;

+ if (dbg)
+ set_bit(wb->memcg_css->id, dbg->iterated_wbs);
+
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
@@ -799,6 +825,9 @@ restart:
if (skip_if_busy && writeback_in_progress(wb))
continue;

+ if (dbg)
+ set_bit(wb->memcg_css->id, dbg->written_wbs);
+
nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

work = kmalloc(sizeof(*work), GFP_ATOMIC);
@@ -817,6 +846,8 @@ restart:
work->auto_free = 0;
work->done = &fallback_work_done;

+ if (dbg)
+ set_bit(wb->memcg_css->id, dbg->sync_wbs);
wb_queue_work(wb, work);

next_memcg_id = wb->memcg_css->id + 1;
@@ -1425,6 +1456,9 @@ static long writeback_sb_inodes(struct s
break;
}

+ inode->i_dbg_marker = 0;
+ inode->i_dbg_marker2 = 0;
+
/*
* Don't bother with new inodes or inodes being freed, first
* kind does not need periodic writeout yet, and for the latter
@@ -1515,6 +1549,7 @@ static long writeback_sb_inodes(struct s
break;
}
}
+
return wrote;
}

@@ -1574,6 +1609,28 @@ static long writeback_inodes_wb(struct b
return nr_pages - work.nr_pages;
}

+static int inode_which_wb_io_list(struct inode *inode, struct backing_dev_info *bdi)
+{
+ struct bdi_writeback *wb = inode->i_wb ?: &bdi->wb;
+ struct inode *pos;
+
+ if (list_empty(&inode->i_io_list))
+ return 0;
+ list_for_each_entry(pos, &wb->b_dirty, i_io_list)
+ if (pos == inode)
+ return 1;
+ list_for_each_entry(pos, &wb->b_io, i_io_list)
+ if (pos == inode)
+ return 2;
+ list_for_each_entry(pos, &wb->b_more_io, i_io_list)
+ if (pos == inode)
+ return 3;
+ list_for_each_entry(pos, &wb->b_dirty_time, i_io_list)
+ if (pos == inode)
+ return 4;
+ return 5;
+}
+
/*
* Explicit flushing or periodic writeback of "old" data.
*
@@ -1604,6 +1661,16 @@ static long wb_writeback(struct bdi_writ

blk_start_plug(&plug);
spin_lock(&wb->list_lock);
+
+ list_for_each_entry(inode, &wb->b_dirty, i_io_list)
+ inode->i_dbg_marker2 = 1;
+ list_for_each_entry(inode, &wb->b_io, i_io_list)
+ inode->i_dbg_marker2 = 2;
+ list_for_each_entry(inode, &wb->b_more_io, i_io_list)
+ inode->i_dbg_marker2 = 3;
+ list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
+ inode->i_dbg_marker2 = 4;
+
for (;;) {
/*
* Stop writeback when nr_pages has been consumed
@@ -1681,6 +1748,24 @@ static long wb_writeback(struct bdi_writ
spin_lock(&wb->list_lock);
}
}
+
+ if (work->sync_mode == WB_SYNC_ALL) {
+ list_for_each_entry(inode, &wb->b_dirty, i_io_list)
+ if (inode->i_dbg_marker2)
+ printk("XXX wb_writeback: inode %lu marker2=%d on b_dirty\n",
+ inode->i_ino, inode->i_dbg_marker2);
+ list_for_each_entry(inode, &wb->b_io, i_io_list)
+ printk("XXX wb_writeback: inode %lu marker2=%d on b_io\n",
+ inode->i_ino, inode->i_dbg_marker2);
+ list_for_each_entry(inode, &wb->b_more_io, i_io_list)
+ printk("XXX wb_writeback: inode %lu marker2=%d on b_more_io\n",
+ inode->i_ino, inode->i_dbg_marker2);
+ list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
+ if (inode->i_dbg_marker2)
+ printk("XXX wb_writeback: inode %lu marker2=%d on b_dirty_time\n",
+ inode->i_ino, inode->i_dbg_marker2);
+ }
+
spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);

@@ -1785,8 +1870,11 @@ static long wb_do_writeback(struct bdi_w

if (work->auto_free)
kfree(work);
- if (done && atomic_dec_and_test(&done->cnt))
- wake_up_all(&wb->bdi->wb_waitq);
+ if (done) {
+ wb->comp_gen++;
+ if (atomic_dec_and_test(&done->cnt))
+ wake_up_all(&wb->bdi->wb_waitq);
+ }
}

/*
@@ -1976,6 +2064,9 @@ void __mark_inode_dirty(struct inode *in

trace_writeback_mark_inode_dirty(inode, flags);

+ WARN_ON_ONCE(!(sb->s_flags & MS_LAZYTIME) &&
+ !list_empty(&inode_to_bdi(inode)->wb.b_dirty_time));
+
/*
* Don't do this for I_DIRTY_PAGES - that doesn't actually
* dirty the inode itself
@@ -2165,7 +2256,7 @@ static void __writeback_inodes_sb_nr(str
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));

- bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
+ bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy, NULL);
wb_wait_for_completion(bdi, &done);
}

@@ -2257,6 +2348,10 @@ void sync_inodes_sb(struct super_block *
.for_sync = 1,
};
struct backing_dev_info *bdi = sb->s_bdi;
+ static DEFINE_MUTEX(dbg_mutex);
+ static struct split_work_dbg dbg;
+ static DECLARE_BITMAP(tmp_bitmap, 8192);
+ struct inode *inode;

/*
* Can't skip on !bdi_has_dirty() because we should wait for !dirty
@@ -2267,9 +2362,56 @@ void sync_inodes_sb(struct super_block *
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));

- bdi_split_work_to_wbs(bdi, &work, false);
+ mutex_lock(&dbg_mutex);
+
+ printk("XXX SYNCING %d:%d\n", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+
+ bitmap_zero(dbg.all_wbs, 8192);
+ bitmap_zero(dbg.iterated_wbs, 8192);
+ bitmap_zero(dbg.written_wbs, 8192);
+ bitmap_zero(dbg.sync_wbs, 8192);
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ spin_lock(&inode->i_lock);
+ inode->i_dbg_marker = inode_which_wb_io_list(inode, bdi);
+ spin_unlock(&inode->i_lock);
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+
+ bdi_split_work_to_wbs(bdi, &work, false, &dbg);
wb_wait_for_completion(bdi, &done);

+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ struct bdi_writeback *wb = inode->i_wb ?: &bdi->wb;
+
+ if (!inode->i_dbg_marker)
+ continue;
+
+ spin_lock_irq(&wb->list_lock);
+ if (inode->i_state & I_DIRTY_ALL)
+ printk_ratelimited("XXX sync_inodes_sb(%d:%d): dirty inode %lu skipped, wb=%d comp_gen=%d->%d which=%d->%d i_state=0x%lx\n",
+ MAJOR(sb->s_dev), MINOR(sb->s_dev), inode->i_ino,
+ wb->memcg_css->id, wb->last_comp_gen, wb->comp_gen,
+ inode->i_dbg_marker, inode_which_wb_io_list(inode, bdi),
+ inode->i_state);
+ spin_unlock_irq(&wb->list_lock);
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+
+ bitmap_andnot(tmp_bitmap, dbg.all_wbs, dbg.iterated_wbs, 8192);
+ if (!bitmap_empty(tmp_bitmap, 8192))
+ printk("XXX sync_inodes_sb(%d:%d): iteration skipped %8192pbl\n",
+ MAJOR(sb->s_dev), MINOR(sb->s_dev), tmp_bitmap);
+
+ printk("XXX all_wbs = %8192pbl\n", dbg.all_wbs);
+ printk("XXX iterated_wbs = %8192pbl\n", dbg.iterated_wbs);
+ printk("XXX written_wbs = %8192pbl\n", dbg.written_wbs);
+ printk("XXX sync_wbs = %8192pbl\n", dbg.sync_wbs);
+
+ mutex_unlock(&dbg_mutex);
+
wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -183,6 +183,7 @@ int inode_init_always(struct super_block
#endif
inode->i_flctx = NULL;
this_cpu_inc(nr_inodes);
+ inode->i_dbg_marker = 0;

return 0;
out:
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -129,6 +129,8 @@ struct bdi_writeback {
struct rcu_head rcu;
};
#endif
+ int comp_gen;
+ int last_comp_gen;
};

struct backing_dev_info {
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -38,7 +38,25 @@ extern struct workqueue_struct *bdi_wq;

static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
- return test_bit(WB_has_dirty_io, &wb->state);
+ bool ret = test_bit(WB_has_dirty_io, &wb->state);
+ long tot_write_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+ if (!ret && (!list_empty(&wb->b_dirty) || !list_empty(&wb->b_io) ||
+ !list_empty(&wb->b_more_io))) {
+ const char *name = wb->bdi->dev ? dev_name(wb->bdi->dev) : "UNK";
+
+ pr_err("wb_has_dirty_io: ERR %s has_dirty=%d b_dirty=%d b_io=%d b_more_io=%d\n",
+ name, ret, !list_empty(&wb->b_dirty), !list_empty(&wb->b_io), !list_empty(&wb->b_more_io));
+ WARN_ON(1);
+ }
+ if (ret && !tot_write_bw) {
+ const char *name = wb->bdi->dev ? dev_name(wb->bdi->dev) : "UNK";
+
+ pr_err("wb_has_dirty_io: ERR %s has_dirty=%d but tot_write_bw=%ld\n",
+ name, ret, tot_write_bw);
+ WARN_ON(1);
+ }
+ return ret;
}

static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -677,6 +677,8 @@ struct inode {
#endif

void *i_private; /* fs or device private pointer */
+ unsigned i_dbg_marker;
+ unsigned i_dbg_marker2;
};

static inline int inode_unhashed(struct inode *inode)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -382,7 +382,7 @@ static void wb_exit(struct bdi_writeback
* protected. cgwb_release_wait is used to wait for the completion of cgwb
* releases from bdi destruction path.
*/
-static DEFINE_SPINLOCK(cgwb_lock);
+DEFINE_SPINLOCK(cgwb_lock);
static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);

/**
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/