 mm/filemap.c | 157 ++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 111 insertions(+), 46 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 385759c4ce4b..e44873da5d19 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1000,8 +1000,16 @@ struct wait_page_queue {
 	wait_queue_entry_t wait;
 };
 
+/*
+ * Wake function return value:
+ *  -1: "stop"
+ *   0: "go on"
+ *   1: we handled an exclusive case
+ */
 static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
 {
+	int ret;
+	struct task_struct *target;
 	struct wait_page_key *key = arg;
 	struct wait_page_queue *wait_page
 		= container_of(wait, struct wait_page_queue, wait);
@@ -1013,18 +1021,46 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
 	if (wait_page->bit_nr != key->bit_nr)
 		return 0;
 
+	/* Stop walking if it's locked */
+	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+		if (test_and_set_bit(key->bit_nr, &key->page->flags))
+			return -1;
+		ret = 1;
+	} else {
+		if (test_bit(key->bit_nr, &key->page->flags))
+			return -1;
+		ret = 0;
+	}
+
+
 	/*
-	 * Stop walking if it's locked.
-	 * Is this safe if put_and_wait_on_page_locked() is in use?
-	 * Yes: the waker must hold a reference to this page, and if PG_locked
-	 * has now already been set by another task, that task must also hold
-	 * a reference to the *same usage* of this page; so there is no need
-	 * to walk on to wake even the put_and_wait_on_page_locked() callers.
+	 * We can no longer use 'wait' after we've done the
+	 * list_del_init(&wait->entry), because at that point
+	 * the target may decide it's all done with no
+	 * other locking, and 'wait' has been allocated on
+	 * the stack of the target.
 	 */
-	if (test_bit(key->bit_nr, &key->page->flags))
-		return -1;
+	target = wait->private;
+	smp_mb();
 
-	return autoremove_wake_function(wait, mode, sync, key);
+	/*
+	 * Ok, we have successfully done what we're waiting for.
+	 *
+	 * Now unconditionally remove the wait entry, so that the
+	 * waiter can use that to see success or not.
+	 *
+	 * We _really_ should have a "list_del_init_careful()"
+	 * to properly pair with an unlocked "list_empty_careful()".
+	 */
+	list_del_init(&wait->entry);
+
+	/*
+	 * There's another memory barrier in the wakeup path
+	 * that makes sure the wakeup happens after the above
+	 * is visible to the target.
+	 */
+	wake_up_state(target, mode);
+	return ret;
 }
 
 static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1054,6 +1090,15 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
 		 * from wait queue
 		 */
 		spin_unlock_irqrestore(&q->lock, flags);
+
+		/*
+		 * If somebody else set the bit again, stop waking
+		 * people up. It's now the responsibility of that
+		 * other page bit owner to do so.
+		 */
+		if (test_bit(bit_nr, &page->flags))
+			return;
+
 		cpu_relax();
 		spin_lock_irqsave(&q->lock, flags);
 		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
@@ -1103,16 +1148,23 @@ enum behavior {
 			 */
 };
 
+static inline int trylock_page_bit_common(struct page *page, int bit_nr,
+	enum behavior behavior)
+{
+	return behavior == EXCLUSIVE ?
+		!test_and_set_bit(bit_nr, &page->flags) :
+		!test_bit(bit_nr, &page->flags);
+}
+
 static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	struct page *page, int bit_nr, int state, enum behavior behavior)
 {
 	struct wait_page_queue wait_page;
 	wait_queue_entry_t *wait = &wait_page.wait;
-	bool bit_is_set;
 	bool thrashing = false;
 	bool delayacct = false;
 	unsigned long pflags;
-	int ret = 0;
+	int ret;
 
 	if (bit_nr == PG_locked &&
 	    !PageUptodate(page) && PageWorkingset(page)) {
@@ -1130,52 +1182,65 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	wait_page.page = page;
 	wait_page.bit_nr = bit_nr;
 
-	for (;;) {
-		spin_lock_irq(&q->lock);
+	/*
+	 * Add ourselves to the wait queue.
+	 *
+	 * NOTE! This is where we also check the page
+	 * state synchronously one last time, to see that
+	 * somebody didn't just clear the bit.
+	 */
+	spin_lock_irq(&q->lock);
+	SetPageWaiters(page);
+	if (!trylock_page_bit_common(page, bit_nr, behavior))
+		__add_wait_queue_entry_tail(q, wait);
+	spin_unlock_irq(&q->lock);
 
-		if (likely(list_empty(&wait->entry))) {
-			__add_wait_queue_entry_tail(q, wait);
-			SetPageWaiters(page);
-		}
+	/*
+	 * From now on, all the logic will be based on
+	 * whether the wait entry is on the queue or not,
+	 * and the page bit testing (and setting) will be
+	 * done by the wake function, not us.
+	 *
+	 * We can drop our reference to the page.
+	 */
+	if (behavior == DROP)
+		put_page(page);
 
+	for (;;) {
 		set_current_state(state);
-		spin_unlock_irq(&q->lock);
-
-		bit_is_set = test_bit(bit_nr, &page->flags);
-		if (behavior == DROP)
-			put_page(page);
+		if (signal_pending_state(state, current))
+			break;
 
-		if (likely(bit_is_set))
-			io_schedule();
+		if (list_empty_careful(&wait->entry))
+			break;
 
-		if (behavior == EXCLUSIVE) {
-			if (!test_and_set_bit_lock(bit_nr, &page->flags))
-				break;
-		} else if (behavior == SHARED) {
-			if (!test_bit(bit_nr, &page->flags))
-				break;
-		}
+		io_schedule();
+	}
 
-		if (signal_pending_state(state, current)) {
+	/*
+	 * We're open-coding finish_wait(), the same way we
+	 * open-coded the add-to-wait-queue side above, because
+	 * we care deeply about whether we needed to remove our
+	 * wait entry or not (that means we failed).
+	 *
+	 * Note that we try to avoid taking the spinlock
+	 * for the wait queue if at all possible. If we
+	 * slept and were woken up, that will have removed
+	 * our entry from the queue, and we don't need to
+	 * do anything else.
+	 */
+	__set_current_state(TASK_RUNNING);
+	ret = 0;
+	if (!list_empty_careful(&wait->entry)) {
+		spin_lock_irq(&q->lock);
+		if (!list_empty(&wait->entry)) {
+			list_del(&wait->entry);
 			ret = -EINTR;
-			break;
-		}
-
-		if (behavior == DROP) {
-			/*
-			 * We can no longer safely access page->flags:
-			 * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
-			 * there is a risk of waiting forever on a page reused
-			 * for something that keeps it locked indefinitely.
-			 * But best check for -EINTR above before breaking.
-			 */
-			break;
 		}
+		spin_unlock_irq(&q->lock);
 	}
 
-	finish_wait(q, wait);
-
 	if (thrashing) {
 		if (delayacct)
 			delayacct_thrashing_end();
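
A side note for anyone who wants to poke at the waiter/waker handshake outside the kernel: below is a minimal userspace sketch of the protocol, using C11 atomics and pthreads. It is an illustration, not the patch. The names are made up: an atomic "queued" flag stands in for "the wait entry is still on the queue" (what list_empty_careful() tests), the release/acquire pair stands in for the smp_mb() pairing described in the comments, and the mutex/condvar stands in for io_schedule()/wake_up_state(). What it demonstrates is the heart of the change: the waker publishes success by dequeueing the waiter and only then waking it, so the woken side never goes back to test the page bit.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static struct {
	atomic_int queued;	/* stand-in for "wait entry is on the queue" */
	pthread_mutex_t lock;
	pthread_cond_t cond;
} w = { 1, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };

static void *waker(void *arg)
{
	(void)arg;
	/*
	 * "Dequeue" the waiter. The release store pairs with the
	 * acquire load on the waiter side: everything the waker did
	 * before this point is visible once the waiter sees 0.
	 */
	atomic_store_explicit(&w.queued, 0, memory_order_release);

	/* The wake_up_state() analogue. */
	pthread_mutex_lock(&w.lock);
	pthread_cond_signal(&w.cond);
	pthread_mutex_unlock(&w.lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);

	/*
	 * The waiter loops on "am I still queued?", not on the bit:
	 * being removed from the queue *is* the success signal.
	 */
	pthread_mutex_lock(&w.lock);
	while (atomic_load_explicit(&w.queued, memory_order_acquire))
		pthread_cond_wait(&w.cond, &w.lock);	/* io_schedule() */
	pthread_mutex_unlock(&w.lock);

	puts("woken: the waker dequeued us");
	return pthread_join(t, NULL);
}

One lifetime caveat the sketch glosses over: here everything is a global, but in the kernel the wait entry lives on the waiter's stack, which is exactly why wake_page_function() saves 'target' before the list_del_init() and never touches 'wait' afterwards.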
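And a similarly hedged sketch of the policy that trylock_page_bit_common() splits out: an EXCLUSIVE waiter has to actually win the 0->1 transition of the bit to avoid queueing, while a SHARED waiter only needs to observe the bit clear. Userspace C11 again, with invented names: try_bit() is hypothetical, atomic_fetch_or() stands in for test_and_set_bit() and a plain atomic load for test_bit().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum behavior { EXCLUSIVE, SHARED };

/* Hypothetical userspace mirror of trylock_page_bit_common(). */
static bool try_bit(atomic_ulong *flags, int bit, enum behavior behavior)
{
	unsigned long mask = 1UL << bit;

	if (behavior == EXCLUSIVE)
		/* test_and_set_bit(): succeed only if we flipped 0 -> 1 */
		return !(atomic_fetch_or(flags, mask) & mask);

	/* test_bit(): succeed if the bit is currently clear */
	return !(atomic_load(flags) & mask);
}

int main(void)
{
	atomic_ulong flags = 0;

	printf("exclusive: %d\n", try_bit(&flags, 0, EXCLUSIVE)); /* 1: took it */
	printf("exclusive: %d\n", try_bit(&flags, 0, EXCLUSIVE)); /* 0: taken */
	printf("shared:    %d\n", try_bit(&flags, 0, SHARED));    /* 0: still set */
	return 0;
}

Note the asymmetry this bakes in: a successful EXCLUSIVE trylock leaves the bit set (the caller now owns it), while a successful SHARED check changes nothing. That is the same asymmetry wake_page_function() handles above: an exclusive wakeup takes the bit over, and the next callback on the list sees the bit set and stops the walk with -1.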