Re: [PATCH] mm: remove global locks from mm/highmem.c

From: Christoph Hellwig
Date: Sun Jan 28 2007 - 09:49:52 EST


On Sun, Jan 28, 2007 at 03:11:34PM +0100, Peter Zijlstra wrote:
> Eradicate global locks.
>
> - kmap_lock is removed by extensive use of atomic_t, a new flush
> scheme and modifying set_page_address to only allow NULL<->virt
> transitions.

What's the point for this? Extensive atomic_t use is usually much worse
than spinlocks. A spinlock region is just a single atomic instruction,
as soon as you do more than one atomic_t you tend to make scalability
worse. Not to mention that atomic_t are much worse when you try to
profile scalability issues.

What benchmark shows a problem with the current locking, and from what
caller? In doubt we just need to convert that caller to kmap_atomic.


>
> A count of 0 is an exclusive state acting as an entry lock. This is done
> using inc_not_zero and cmpxchg. The restriction on changing the virtual
> address closes the gap with concurrent additions of the same entry.
>
> - pool_lock is removed by using the pkmap index for the
> page_address_maps.
>
> By using the pkmap index for the hash entries it is no longer needed to
> keep a free list.
>
> This patch has been in -rt for a while but should also help regular
> highmem machines with multiple cores/cpus.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> ---
> include/linux/mm.h | 32 ++-
> mm/highmem.c | 433 ++++++++++++++++++++++++++++++-----------------------
> 2 files changed, 276 insertions(+), 189 deletions(-)
>
> Index: linux/include/linux/mm.h
> ===================================================================
> --- linux.orig/include/linux/mm.h
> +++ linux/include/linux/mm.h
> @@ -543,23 +543,39 @@ static __always_inline void *lowmem_page
> #endif
>
> #if defined(WANT_PAGE_VIRTUAL)
> -#define page_address(page) ((page)->virtual)
> -#define set_page_address(page, address) \
> - do { \
> - (page)->virtual = (address); \
> - } while(0)
> -#define page_address_init() do { } while(0)
> +/*
> + * wrap page->virtual so it is safe to set/read locklessly
> + */
> +#define page_address(page) \
> + ({ typeof((page)->virtual) v = (page)->virtual; \
> + smp_read_barrier_depends(); \
> + v; })
> +
> +static inline int set_page_address(struct page *page, void *address)
> +{
> + if (address)
> + return cmpxchg(&page->virtual, NULL, address) == NULL;
> + else {
> + /*
> + * cmpxchg is a bit abused because it is not guaranteed
> + * safe wrt direct assignment on all platforms.
> + */
> + void *virt = page->virtual;
> + return cmpxchg(&page->vitrual, virt, NULL) == virt;
> + }
> +}
> +void page_address_init(void);
> #endif
>
> #if defined(HASHED_PAGE_VIRTUAL)
> void *page_address(struct page *page);
> -void set_page_address(struct page *page, void *virtual);
> +int set_page_address(struct page *page, void *virtual);
> void page_address_init(void);
> #endif
>
> #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
> #define page_address(page) lowmem_page_address(page)
> -#define set_page_address(page, address) do { } while(0)
> +#define set_page_address(page, address) (0)
> #define page_address_init() do { } while(0)
> #endif
>
> Index: linux/mm/highmem.c
> ===================================================================
> --- linux.orig/mm/highmem.c
> +++ linux/mm/highmem.c
> @@ -14,6 +14,11 @@
> * based on Linus' idea.
> *
> * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx>
> + *
> + * Largely rewritten to get rid of all global locks
> + *
> + * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@xxxxxxxxxx>
> + *
> */
>
> #include <linux/mm.h>
> @@ -27,18 +32,14 @@
> #include <linux/hash.h>
> #include <linux/highmem.h>
> #include <linux/blktrace_api.h>
> +
> #include <asm/tlbflush.h>
> +#include <asm/pgtable.h>
>
> -/*
> - * Virtual_count is not a pure "count".
> - * 0 means that it is not mapped, and has not been mapped
> - * since a TLB flush - it is usable.
> - * 1 means that there are no users, but it has been mapped
> - * since the last TLB flush - so we can't use it.
> - * n means that there are (n-1) current users of it.
> - */
> #ifdef CONFIG_HIGHMEM
>
> +static int __set_page_address(struct page *page, void *virtual, int pos);
> +
> unsigned long totalhigh_pages __read_mostly;
>
> unsigned int nr_free_highpages (void)
> @@ -52,164 +53,208 @@ unsigned int nr_free_highpages (void)
> return pages;
> }
>
> -static int pkmap_count[LAST_PKMAP];
> -static unsigned int last_pkmap_nr;
> -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
> +/*
> + * count is not a pure "count".
> + * 0 means its owned exclusively by someone
> + * 1 means its free for use - either mapped or not.
> + * n means that there are (n-1) current users of it.
> + */
> +static atomic_t pkmap_count[LAST_PKMAP];
> +static atomic_t pkmap_hand;
>
> pte_t * pkmap_page_table;
>
> static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
>
> -static void flush_all_zero_pkmaps(void)
> +/*
> + * Try to free a given kmap slot.
> + *
> + * Returns:
> + * -1 - in use
> + * 0 - free, no TLB flush needed
> + * 1 - free, needs TLB flush
> + */
> +static int pkmap_try_free(int pos)
> {
> - int i;
> -
> - flush_cache_kmaps();
> + if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1)
> + return -1;
>
> - for (i = 0; i < LAST_PKMAP; i++) {
> - struct page *page;
> + /*
> + * TODO: add a young bit to make it CLOCK
> + */
> + if (!pte_none(pkmap_page_table[pos])) {
> + struct page *page = pte_page(pkmap_page_table[pos]);
> + unsigned long addr = PKMAP_ADDR(pos);
> + pte_t *ptep = &pkmap_page_table[pos];
> +
> + VM_BUG_ON(addr != (unsigned long)page_address(page));
> +
> + if (!__set_page_address(page, NULL, pos))
> + BUG();
> + flush_kernel_dcache_page(page);
> + pte_clear(&init_mm, addr, ptep);
>
> - /*
> - * zero means we don't have anything to do,
> - * >1 means that it is still in use. Only
> - * a count of 1 means that it is free but
> - * needs to be unmapped
> - */
> - if (pkmap_count[i] != 1)
> - continue;
> - pkmap_count[i] = 0;
> + return 1;
> + }
>
> - /* sanity check */
> - BUG_ON(pte_none(pkmap_page_table[i]));
> + return 0;
> +}
>
> - /*
> - * Don't need an atomic fetch-and-clear op here;
> - * no-one has the page mapped, and cannot get at
> - * its virtual address (and hence PTE) without first
> - * getting the kmap_lock (which is held here).
> - * So no dangers, even with speculative execution.
> - */
> - page = pte_page(pkmap_page_table[i]);
> - pte_clear(&init_mm, (unsigned long)page_address(page),
> - &pkmap_page_table[i]);
> +static inline void pkmap_put(atomic_t *counter)
> +{
> + switch (atomic_dec_return(counter)) {
> + case 0:
> + BUG();
>
> - set_page_address(page, NULL);
> + case 1:
> + wake_up(&pkmap_map_wait);
> }
> - flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
> }
>
> -static inline unsigned long map_new_virtual(struct page *page)
> +#define TLB_BATCH 32
> +
> +static int pkmap_get_free(void)
> {
> - unsigned long vaddr;
> - int count;
> + int i, pos, flush;
> + DECLARE_WAITQUEUE(wait, current);
>
> -start:
> - count = LAST_PKMAP;
> - /* Find an empty entry */
> - for (;;) {
> - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
> - if (!last_pkmap_nr) {
> - flush_all_zero_pkmaps();
> - count = LAST_PKMAP;
> - }
> - if (!pkmap_count[last_pkmap_nr])
> - break; /* Found a usable entry */
> - if (--count)
> - continue;
> +restart:
> + for (i = 0; i < LAST_PKMAP; i++) {
> + pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP;
> + flush = pkmap_try_free(pos);
> + if (flush >= 0)
> + goto got_one;
> + }
> +
> + /*
> + * wait for somebody else to unmap their entries
> + */
> + __set_current_state(TASK_UNINTERRUPTIBLE);
> + add_wait_queue(&pkmap_map_wait, &wait);
> + schedule();
> + remove_wait_queue(&pkmap_map_wait, &wait);
> +
> + goto restart;
> +
> +got_one:
> + if (flush) {
> +#if 0
> + flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
> +#else
> + int pos2 = (pos + 1) % LAST_PKMAP;
> + int nr;
> + int entries[TLB_BATCH];
>
> /*
> - * Sleep for somebody else to unmap their entries
> + * For those architectures that cannot help but flush the
> + * whole TLB, flush some more entries to make it worthwhile.
> + * Scan ahead of the hand to minimise search distances.
> */
> - {
> - DECLARE_WAITQUEUE(wait, current);
> + for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
> + i++, pos2 = (pos2 + 1) % LAST_PKMAP) {
>
> - __set_current_state(TASK_UNINTERRUPTIBLE);
> - add_wait_queue(&pkmap_map_wait, &wait);
> - spin_unlock(&kmap_lock);
> - schedule();
> - remove_wait_queue(&pkmap_map_wait, &wait);
> - spin_lock(&kmap_lock);
> -
> - /* Somebody else might have mapped it while we slept */
> - if (page_address(page))
> - return (unsigned long)page_address(page);
> + flush = pkmap_try_free(pos2);
> + if (flush < 0)
> + continue;
> +
> + if (!flush) {
> + atomic_t *counter = &pkmap_count[pos2];
> + VM_BUG_ON(atomic_read(counter) != 0);
> + atomic_set(counter, 2);
> + pkmap_put(counter);
> + } else
> + entries[nr++] = pos2;
> + }
> + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
>
> - /* Re-start */
> - goto start;
> + for (i = 0; i < nr; i++) {
> + atomic_t *counter = &pkmap_count[entries[i]];
> + VM_BUG_ON(atomic_read(counter) != 0);
> + atomic_set(counter, 2);
> + pkmap_put(counter);
> }
> +#endif
> }
> - vaddr = PKMAP_ADDR(last_pkmap_nr);
> - set_pte_at(&init_mm, vaddr,
> - &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
> + return pos;
> +}
> +
> +static unsigned long pkmap_insert(struct page *page)
> +{
> + int pos = pkmap_get_free();
> + unsigned long vaddr = PKMAP_ADDR(pos);
> + pte_t *ptep = &pkmap_page_table[pos];
> + pte_t entry = mk_pte(page, kmap_prot);
> + atomic_t *counter = &pkmap_count[pos];
> +
> + VM_BUG_ON(atomic_read(counter) != 0);
>
> - pkmap_count[last_pkmap_nr] = 1;
> - set_page_address(page, (void *)vaddr);
> + set_pte_at(&init_mm, vaddr, ptep, entry);
> + if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) {
> + /*
> + * concurrent pkmap_inserts for this page -
> + * the other won the race, release this entry.
> + *
> + * we can still clear the pte without a tlb flush since
> + * it couldn't have been used yet.
> + */
> + pte_clear(&init_mm, vaddr, ptep);
> + VM_BUG_ON(atomic_read(counter) != 0);
> + atomic_set(counter, 2);
> + pkmap_put(counter);
> + vaddr = 0;
> + } else
> + atomic_set(counter, 2);
>
> return vaddr;
> }
>
> -void fastcall *kmap_high(struct page *page)
> +fastcall void *kmap_high(struct page *page)
> {
> unsigned long vaddr;
> -
> - /*
> - * For highmem pages, we can't trust "virtual" until
> - * after we have the lock.
> - *
> - * We cannot call this from interrupts, as it may block
> - */
> - spin_lock(&kmap_lock);
> +again:
> vaddr = (unsigned long)page_address(page);
> + if (vaddr) {
> + atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)];
> + if (atomic_inc_not_zero(counter)) {
> + /*
> + * atomic_inc_not_zero implies a (memory) barrier on success
> + * so page address will be reloaded.
> + */
> + unsigned long vaddr2 = (unsigned long)page_address(page);
> + if (likely(vaddr == vaddr2))
> + return (void *)vaddr;
> +
> + /*
> + * Oops, we got someone else.
> + *
> + * This can happen if we get preempted after
> + * page_address() and before atomic_inc_not_zero()
> + * and during that preemption this slot is freed and
> + * reused.
> + */
> + pkmap_put(counter);
> + goto again;
> + }
> + }
> +
> + vaddr = pkmap_insert(page);
> if (!vaddr)
> - vaddr = map_new_virtual(page);
> - pkmap_count[PKMAP_NR(vaddr)]++;
> - BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
> - spin_unlock(&kmap_lock);
> - return (void*) vaddr;
> + goto again;
> +
> + return (void *)vaddr;
> }
>
> EXPORT_SYMBOL(kmap_high);
>
> -void fastcall kunmap_high(struct page *page)
> +fastcall void kunmap_high(struct page *page)
> {
> - unsigned long vaddr;
> - unsigned long nr;
> - int need_wakeup;
> -
> - spin_lock(&kmap_lock);
> - vaddr = (unsigned long)page_address(page);
> + unsigned long vaddr = (unsigned long)page_address(page);
> BUG_ON(!vaddr);
> - nr = PKMAP_NR(vaddr);
> -
> - /*
> - * A count must never go down to zero
> - * without a TLB flush!
> - */
> - need_wakeup = 0;
> - switch (--pkmap_count[nr]) {
> - case 0:
> - BUG();
> - case 1:
> - /*
> - * Avoid an unnecessary wake_up() function call.
> - * The common case is pkmap_count[] == 1, but
> - * no waiters.
> - * The tasks queued in the wait-queue are guarded
> - * by both the lock in the wait-queue-head and by
> - * the kmap_lock. As the kmap_lock is held here,
> - * no need for the wait-queue-head's lock. Simply
> - * test if the queue is empty.
> - */
> - need_wakeup = waitqueue_active(&pkmap_map_wait);
> - }
> - spin_unlock(&kmap_lock);
> -
> - /* do wake-up, if needed, race-free outside of the spin lock */
> - if (need_wakeup)
> - wake_up(&pkmap_map_wait);
> + pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]);
> }
>
> EXPORT_SYMBOL(kunmap_high);
> +
> #endif
>
> #if defined(HASHED_PAGE_VIRTUAL)
> @@ -217,19 +262,13 @@ EXPORT_SYMBOL(kunmap_high);
> #define PA_HASH_ORDER 7
>
> /*
> - * Describes one page->virtual association
> + * Describes one page->virtual address association.
> */
> -struct page_address_map {
> +static struct page_address_map {
> struct page *page;
> void *virtual;
> struct list_head list;
> -};
> -
> -/*
> - * page_address_map freelist, allocated from page_address_maps.
> - */
> -static struct list_head page_address_pool; /* freelist */
> -static spinlock_t pool_lock; /* protects page_address_pool */
> +} page_address_maps[LAST_PKMAP];
>
> /*
> * Hash table bucket
> @@ -244,91 +283,123 @@ static struct page_address_slot *page_sl
> return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
> }
>
> -void *page_address(struct page *page)
> +static void *__page_address(struct page_address_slot *pas, struct page *page)
> {
> - unsigned long flags;
> - void *ret;
> - struct page_address_slot *pas;
> -
> - if (!PageHighMem(page))
> - return lowmem_page_address(page);
> + void *ret = NULL;
>
> - pas = page_slot(page);
> - ret = NULL;
> - spin_lock_irqsave(&pas->lock, flags);
> if (!list_empty(&pas->lh)) {
> struct page_address_map *pam;
>
> list_for_each_entry(pam, &pas->lh, list) {
> if (pam->page == page) {
> ret = pam->virtual;
> - goto done;
> + break;
> }
> }
> }
> -done:
> +
> + return ret;
> +}
> +
> +void *page_address(struct page *page)
> +{
> + unsigned long flags;
> + void *ret;
> + struct page_address_slot *pas;
> +
> + if (!PageHighMem(page))
> + return lowmem_page_address(page);
> +
> + pas = page_slot(page);
> + spin_lock_irqsave(&pas->lock, flags);
> + ret = __page_address(pas, page);
> spin_unlock_irqrestore(&pas->lock, flags);
> return ret;
> }
>
> EXPORT_SYMBOL(page_address);
>
> -void set_page_address(struct page *page, void *virtual)
> +static int __set_page_address(struct page *page, void *virtual, int pos)
> {
> + int ret = 0;
> unsigned long flags;
> struct page_address_slot *pas;
> struct page_address_map *pam;
>
> - BUG_ON(!PageHighMem(page));
> + VM_BUG_ON(!PageHighMem(page));
> + VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0);
> + VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP);
>
> pas = page_slot(page);
> - if (virtual) { /* Add */
> - BUG_ON(list_empty(&page_address_pool));
> + pam = &page_address_maps[pos];
>
> - spin_lock_irqsave(&pool_lock, flags);
> - pam = list_entry(page_address_pool.next,
> - struct page_address_map, list);
> - list_del(&pam->list);
> - spin_unlock_irqrestore(&pool_lock, flags);
> -
> - pam->page = page;
> - pam->virtual = virtual;
> -
> - spin_lock_irqsave(&pas->lock, flags);
> - list_add_tail(&pam->list, &pas->lh);
> - spin_unlock_irqrestore(&pas->lock, flags);
> - } else { /* Remove */
> - spin_lock_irqsave(&pas->lock, flags);
> - list_for_each_entry(pam, &pas->lh, list) {
> - if (pam->page == page) {
> - list_del(&pam->list);
> - spin_unlock_irqrestore(&pas->lock, flags);
> - spin_lock_irqsave(&pool_lock, flags);
> - list_add_tail(&pam->list, &page_address_pool);
> - spin_unlock_irqrestore(&pool_lock, flags);
> - goto done;
> - }
> + spin_lock_irqsave(&pas->lock, flags);
> + if (virtual) { /* add */
> + VM_BUG_ON(!list_empty(&pam->list));
> +
> + if (!__page_address(pas, page)) {
> + pam->page = page;
> + pam->virtual = virtual;
> + list_add_tail(&pam->list, &pas->lh);
> + ret = 1;
> + }
> + } else { /* remove */
> + if (!list_empty(&pam->list)) {
> + list_del_init(&pam->list);
> + ret = 1;
> }
> - spin_unlock_irqrestore(&pas->lock, flags);
> }
> -done:
> - return;
> + spin_unlock_irqrestore(&pas->lock, flags);
> +
> + return ret;
> }
>
> -static struct page_address_map page_address_maps[LAST_PKMAP];
> +int set_page_address(struct page *page, void *virtual)
> +{
> + /*
> + * set_page_address is not supposed to be called when using
> + * hashed virtual addresses.
> + */
> + BUG();
> + return 0;
> +}
>
> -void __init page_address_init(void)
> +void __init __page_address_init(void)
> {
> int i;
>
> - INIT_LIST_HEAD(&page_address_pool);
> for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
> - list_add(&page_address_maps[i].list, &page_address_pool);
> + INIT_LIST_HEAD(&page_address_maps[i].list);
> +
> for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
> INIT_LIST_HEAD(&page_address_htable[i].lh);
> spin_lock_init(&page_address_htable[i].lock);
> }
> - spin_lock_init(&pool_lock);
> +}
> +
> +#elif defined (CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */
> +
> +static int __set_page_address(struct page *page, void *virtual, int pos)
> +{
> + return set_page_address(page, virtual);
> }
>
> #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
> +
> +#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL)
> +
> +void __init page_address_init(void)
> +{
> +#ifdef CONFIG_HIGHMEM
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(pkmap_count); i++)
> + atomic_set(&pkmap_count[i], 1);
> +#endif
> +
> +#ifdef HASHED_PAGE_VIRTUAL
> + __page_address_init();
> +#endif
> +}
> +
> +#endif
>
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
---end quoted text---
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/