Re: regression caused by cgroups optimization in 3.17-rc2

From: Johannes Weiner
Date: Fri Sep 05 2014 - 10:47:46 EST


On Fri, Sep 05, 2014 at 11:25:37AM +0200, Michal Hocko wrote:
> @@ -900,10 +900,10 @@ void lru_add_drain_all(void)
> * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
> * will free it.
> */
> -void release_pages(struct page **pages, int nr, bool cold)
> +static void release_lru_pages(struct page **pages, int nr,
> + struct list_head *pages_to_free)
> {
> int i;
> - LIST_HEAD(pages_to_free);
> struct zone *zone = NULL;
> struct lruvec *lruvec;
> unsigned long uninitialized_var(flags);
> @@ -943,11 +943,26 @@ void release_pages(struct page **pages, int nr, bool cold)
> /* Clear Active bit in case of parallel mark_page_accessed */
> __ClearPageActive(page);
>
> - list_add(&page->lru, &pages_to_free);
> + list_add(&page->lru, pages_to_free);
> }
> if (zone)
> spin_unlock_irqrestore(&zone->lru_lock, flags);
> +}
> +/*
> + * Batched page_cache_release(). Frees and uncharges all given pages
> + * for which the reference count drops to 0.
> + */
> +void release_pages(struct page **pages, int nr, bool cold)
> +{
> + LIST_HEAD(pages_to_free);
>
> + while (nr) {
> + int batch = min(nr, PAGEVEC_SIZE);
> +
> + release_lru_pages(pages, batch, &pages_to_free);
> + pages += batch;
> + nr -= batch;
> + }

We might be able to process a lot more pages in one go if nobody else
needs the lock or the CPU. Can't we just cycle the lock or reschedule
if necessary?

diff --git a/mm/swap.c b/mm/swap.c
index 6b2dc3897cd5..ee0cf21dd521 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -944,6 +944,15 @@ void release_pages(struct page **pages, int nr, bool cold)
__ClearPageActive(page);

list_add(&page->lru, &pages_to_free);
+
+ if (should_resched() ||
+ (zone && spin_needbreak(&zone->lru_lock))) {
+ if (zone) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = NULL;
+ }
+ cond_resched();
+ }
}
if (zone)
spin_unlock_irqrestore(&zone->lru_lock, flags);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3e0ec83d000c..c487ca4682a4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -262,19 +262,12 @@ void free_page_and_swap_cache(struct page *page)
*/
void free_pages_and_swap_cache(struct page **pages, int nr)
{
- struct page **pagep = pages;
+ int i;

lru_add_drain();
- while (nr) {
- int todo = min(nr, PAGEVEC_SIZE);
- int i;
-
- for (i = 0; i < todo; i++)
- free_swap_cache(pagep[i]);
- release_pages(pagep, todo, false);
- pagep += todo;
- nr -= todo;
- }
+ for (i = 0; i < nr; i++)
+ free_swap_cache(pages[i]);
+ release_pages(pages, nr, false);
}

/*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/