Re: [RFC 2/2] x86_64: expand kernel stack to 16K

From: Linus Torvalds
Date: Thu May 29 2014 - 20:05:26 EST


On Thu, May 29, 2014 at 4:36 PM, Minchan Kim <minchan@xxxxxxxxxx> wrote:
>
> I did the hacky test below to apply your idea and the result is overflow again.
> So, again, it would second stack expansion. Otherwise, we should prevent
> swapout in direct reclaim.

So changing io_schedule() is bad, for the reasons I outlined elsewhere
(we use it for wait_for_page*() - see sleep_on_page()).

It's the congestion waiting where the io_schedule() should be avoided.
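
The distinction is really about which blk_flush_plug_list() mode the
sleeping path ends up in. A minimal sketch of the two behaviors, assuming
the current block layer - the helper names are made up purely for
illustration and are not in the patch:

#include <linux/blkdev.h>
#include <linux/sched.h>

/* Illustrative helpers only - these are not in the patch. */

static long sleep_flushing_on_this_stack(long timeout)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	/*
	 * io_schedule_timeout() calls blk_flush_plug(current), i.e.
	 * blk_flush_plug_list(plug, false): the plugged requests get
	 * submitted right here, on the caller's already-deep stack.
	 */
	return io_schedule_timeout(timeout);
}

static long sleep_punting_to_kblockd(long timeout)
{
	struct blk_plug *plug = current->plug;

	set_current_state(TASK_UNINTERRUPTIBLE);
	/*
	 * from_schedule == true: the block layer kicks the queues
	 * asynchronously (kblockd), so the actual submission does not
	 * run on this stack.  Plain schedule_timeout() only triggers
	 * the same deferred flush via sched_submit_work().
	 */
	if (plug)
		blk_flush_plug_list(plug, true);
	return schedule_timeout(timeout);
}

The patch below does the second thing, just wrapped in the
prepare_to_wait()/finish_wait() dance on the congestion waitqueue that
congestion_wait() was already doing.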

So maybe test a patch something like the attached.

NOTE! This is absolutely TOTALLY UNTESTED! It might do horrible
horrible things. It seems to compile, but I have absolutely no reason
to believe that it would work. I didn't actually test that this moves
anything at all to kblockd. So think of it as a concept patch that
*might* work, but as Dave said, there might also be other things that
cause unplugging and need some tough love.

Linus
 mm/backing-dev.c | 28 ++++++++++++++++++----------
 mm/vmscan.c      |  4 +---
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 09d9591b7708..cb26b24c2da2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -11,6 +11,7 @@
 #include <linux/writeback.h>
 #include <linux/device.h>
 #include <trace/events/writeback.h>
+#include <linux/blkdev.h>
 
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

@@ -573,6 +574,21 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
+static long congestion_timeout(int sync, long timeout)
+{
+	long ret;
+	DEFINE_WAIT(wait);
+	struct blk_plug *plug = current->plug;
+	wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	if (plug)
+		blk_flush_plug_list(plug, true);
+	ret = schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+	return ret;
+}
+
 /**
  * congestion_wait - wait for a backing_dev to become uncongested
  * @sync: SYNC or ASYNC IO
@@ -586,12 +602,8 @@ long congestion_wait(int sync, long timeout)
 {
 	long ret;
 	unsigned long start = jiffies;
-	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
-	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
-	ret = io_schedule_timeout(timeout);
-	finish_wait(wqh, &wait);
+	ret = congestion_timeout(sync,timeout);
 
 	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
 					jiffies_to_usecs(jiffies - start));
@@ -622,8 +634,6 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 {
 	long ret;
 	unsigned long start = jiffies;
-	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	/*
 	 * If there is no congestion, or heavy congestion is not being
@@ -643,9 +653,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 	}
 
 	/* Sleep until uncongested or a write happens */
-	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
-	ret = io_schedule_timeout(timeout);
-	finish_wait(wqh, &wait);
+	ret = congestion_timeout(sync, timeout);
 
 out:
 	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 32c661d66a45..1e524000b83e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -989,9 +989,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			 * avoid risk of stack overflow but only writeback
 			 * if many dirty pages have been encountered.
 			 */
-			if (page_is_file_cache(page) &&
-					(!current_is_kswapd() ||
-					 !zone_is_reclaim_dirty(zone))) {
+			if (!current_is_kswapd() || !zone_is_reclaim_dirty(zone)) {
 				/*
 				 * Immediately reclaim when written back.
 				 * Similar in principal to deactivate_page()