Re: CONFIG_VMAP_STACK, on-stack struct, and wake_up_bit

From: Linus Torvalds
Date: Wed Oct 26 2016 - 13:15:38 EST


On Wed, Oct 26, 2016 at 9:32 AM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> Quite frankly, I think the solution is to just rip out all the insane
> zone crap.

IOW, something like the attached.

Advantage:

- just look at the number of garbage lines removed! 21
insertions(+), 182 deletions(-)

- it will actually speed up even the current case for all common
situations: no idiotic extra indirections that will take extra cache
misses

- because the bit_wait_table array is now denser (256 entries is
about 6kB of data on 64-bit with no spinlock debugging, so ~100
cachelines), maybe it gets fewer cache misses too

- we know how to handle the page_waitqueue contention issue, and it
has nothing to do with the stupid NUMA zones

The only case you actually get real page wait activity is IO, and I
suspect that hashing it out over ~100 cachelines will be more than
sufficient to avoid excessive contention, plus it's a cache-miss vs an
IO, so nobody sane cares.

The only reason it did that insane per-zone thing in the first place
that right now we access those wait-queues even when we damn well
shouldn't, and we have the solution for that.

Guys, holler if you hate this, but I think it's realistically the only
sane solution to the "wait queue on stack" issue.

Oh, and the patch is obviously entirely untested. I wouldn't want to
ruin my reputation by *testing* the patches I send out. What would be
the fun in that?

Linus
include/linux/mmzone.h | 30 +------------
kernel/sched/core.c | 16 +++++++
kernel/sched/wait.c | 10 -----
mm/filemap.c | 4 +-
mm/memory_hotplug.c | 28 ------------
mm/page_alloc.c | 115 +------------------------------------------------
6 files changed, 21 insertions(+), 182 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7f2ae99e5daf..0f088f3a2fed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -440,33 +440,7 @@ struct zone {
seqlock_t span_seqlock;
#endif

- /*
- * wait_table -- the array holding the hash table
- * wait_table_hash_nr_entries -- the size of the hash table array
- * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
- *
- * The purpose of all these is to keep track of the people
- * waiting for a page to become available and make them
- * runnable again when possible. The trouble is that this
- * consumes a lot of space, especially when so few things
- * wait on pages at a given time. So instead of using
- * per-page waitqueues, we use a waitqueue hash table.
- *
- * The bucket discipline is to sleep on the same queue when
- * colliding and wake all in that wait queue when removing.
- * When something wakes, it must check to be sure its page is
- * truly available, a la thundering herd. The cost of a
- * collision is great, but given the expected load of the
- * table, they should be so rare as to be outweighed by the
- * benefits from the saved space.
- *
- * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
- * primary users of these fields, and in mm/page_alloc.c
- * free_area_init_core() performs the initialization of them.
- */
- wait_queue_head_t *wait_table;
- unsigned long wait_table_hash_nr_entries;
- unsigned long wait_table_bits;
+ int initialized;

/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
@@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)

static inline bool zone_is_initialized(struct zone *zone)
{
- return !!zone->wait_table;
+ return zone->initialized;
}

static inline bool zone_is_empty(struct zone *zone)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1ab00a..42d4027f9e26 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);

+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;

+ for (i = 0; i < WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(bit_wait_table + i);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 4f7053579fe3..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit)
}
EXPORT_SYMBOL(wake_up_bit);

-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
- const int shift = BITS_PER_LONG == 32 ? 5 : 6;
- const struct zone *zone = page_zone(virt_to_page(word));
- unsigned long val = (unsigned long)word << shift | bit;
-
- return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
/*
* Manipulate the atomic_t address to produce a better bit waitqueue table hash
* index (we're keying off bit -1, but that would produce a horrible hash
diff --git a/mm/filemap.c b/mm/filemap.c
index 849f459ad078..c7fe2f16503f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc);
*/
wait_queue_head_t *page_waitqueue(struct page *page)
{
- const struct zone *zone = page_zone(page);
-
- return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+ return bit_waitqueue(page, 0);
}
EXPORT_SYMBOL(page_waitqueue);

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 962927309b6e..b18dab401be6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
unsigned long i, pfn, end_pfn, nr_pages;
int node = pgdat->node_id;
struct page *page;
- struct zone *zone;

nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
page = virt_to_page(pgdat);
@@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
for (i = 0; i < nr_pages; i++, page++)
get_page_bootmem(node, page, NODE_INFO);

- zone = &pgdat->node_zones[0];
- for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
- if (zone_is_initialized(zone)) {
- nr_pages = zone->wait_table_hash_nr_entries
- * sizeof(wait_queue_head_t);
- nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
- page = virt_to_page(zone->wait_table);
-
- for (i = 0; i < nr_pages; i++, page++)
- get_page_bootmem(node, page, NODE_INFO);
- }
- }
-
pfn = pgdat->node_start_pfn;
end_pfn = pgdat_end_pfn(pgdat);

@@ -2158,20 +2144,6 @@ void try_offline_node(int nid)
*/
node_set_offline(nid);
unregister_one_node(nid);
-
- /* free waittable in each zone */
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- /*
- * wait_table may be allocated from boot memory,
- * here only free if it's allocated by vmalloc.
- */
- if (is_vmalloc_addr(zone->wait_table)) {
- vfree(zone->wait_table);
- zone->wait_table = NULL;
- }
- }
}
EXPORT_SYMBOL(try_offline_node);

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b3bf6767d54..de7c6e43b1c9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4977,72 +4977,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
}

/*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE 256
-
-#ifndef CONFIG_MEMORY_HOTPLUG
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
- unsigned long size = 1;
-
- pages /= PAGES_PER_WAITQUEUE;
-
- while (size < pages)
- size <<= 1;
-
- /*
- * Once we have dozens or even hundreds of threads sleeping
- * on IO we've got bigger problems than wait queue collision.
- * Limit the size of the wait table to a reasonable size.
- */
- size = min(size, 4096UL);
-
- return max(size, 4UL);
-}
-#else
-/*
- * A zone's size might be changed by hot-add, so it is not possible to determine
- * a suitable size for its wait_table. So we use the maximum size now.
- *
- * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
- *
- * i386 (preemption config) : 4096 x 16 = 64Kbyte.
- * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
- * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
- *
- * The maximum entries are prepared when a zone's memory is (512K + 256) pages
- * or more by the traditional way. (See above). It equals:
- *
- * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
- * ia64(16K page size) : = ( 8G + 4M)byte.
- * powerpc (64K page size) : = (32G +16M)byte.
- */
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
- return 4096UL;
-}
-#endif
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
- return ffz(~size);
-}
-
-/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
@@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void)
alloc_percpu(struct per_cpu_nodestat);
}

-static noinline __ref
-int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
- int i;
- size_t alloc_size;
-
- /*
- * The per-page waitqueue mechanism uses hashed waitqueues
- * per zone.
- */
- zone->wait_table_hash_nr_entries =
- wait_table_hash_nr_entries(zone_size_pages);
- zone->wait_table_bits =
- wait_table_bits(zone->wait_table_hash_nr_entries);
- alloc_size = zone->wait_table_hash_nr_entries
- * sizeof(wait_queue_head_t);
-
- if (!slab_is_available()) {
- zone->wait_table = (wait_queue_head_t *)
- memblock_virt_alloc_node_nopanic(
- alloc_size, zone->zone_pgdat->node_id);
- } else {
- /*
- * This case means that a zone whose size was 0 gets new memory
- * via memory hot-add.
- * But it may be the case that a new node was hot-added. In
- * this case vmalloc() will not be able to use this new node's
- * memory - this wait_table must be initialized to use this new
- * node itself as well.
- * To use this new node's memory, further consideration will be
- * necessary.
- */
- zone->wait_table = vmalloc(alloc_size);
- }
- if (!zone->wait_table)
- return -ENOMEM;
-
- for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
- init_waitqueue_head(zone->wait_table + i);
-
- return 0;
-}
-
static __meminit void zone_pcp_init(struct zone *zone)
{
/*
@@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
- int ret;
- ret = zone_wait_table_init(zone, size);
- if (ret)
- return ret;
+
pgdat->nr_zones = zone_idx(zone) + 1;

zone->zone_start_pfn = zone_start_pfn;
@@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
zone_start_pfn, (zone_start_pfn + size));

zone_init_free_lists(zone);
+ zone->initialized = 1;

return 0;
}