Re: [PATCH 2/4] modify swap_map and add SWAP_HAS_CACHE flag.

From: Daisuke Nishimura
Date: Mon Jun 01 2009 - 03:10:35 EST


> BTW, I'm now testing(with swap-in/out and swap-on/off) [2/4] of this patch set.
I've not hit any critical BUG during this weekend in my test.

Tested-by: Daisuke Nishimura <nishimura@xxxxxxxxxxxxxxxxx>

Thanks,
Daisuke Nishimura.

On Thu, 28 May 2009 14:19:00 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
>
> This is a part of patches for fixing memcg's swap account leak. But, IMHO,
> not a bad patch even if no memcg.
>
> Now, reference to swap is counted by swap_map[], an array of unsigned short.
> There are 2 kinds of references to swap.
> - reference from swap entry
> - reference from swap cache
> Then,
> - If there is swap cache && swap's refcnt is 1, there is only swap cache.
> (*) swapcount(entry) == 1 && find_get_page(swapper_space, entry) != NULL
>
> This counting logic have worked well for a long time. But considering that
> we cannot know there is a _real_ reference or not by swap_map[], current usage
> of counter is not very good.
>
> This patch adds a flag SWAP_HAS_CACHE and recored information that a swap entry
> has a cache or not. This will remove -1 magic used in swapfile.c and be a help
> to avoid unnecessary find_get_page().
>
> Changelog: v1->v2
> - fixed swapcache_prepare()'s return code.
> - changed swap_duplicate() be void function.
> - fixed racy case in swapoff().
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> ---
> include/linux/swap.h | 14 ++-
> mm/swap_state.c | 5 +
> mm/swapfile.c | 203 ++++++++++++++++++++++++++++++++++++---------------
> 3 files changed, 160 insertions(+), 62 deletions(-)
>
> Index: new-trial-swapcount2/include/linux/swap.h
> ===================================================================
> --- new-trial-swapcount2.orig/include/linux/swap.h
> +++ new-trial-swapcount2/include/linux/swap.h
> @@ -129,9 +129,10 @@ enum {
>
> #define SWAP_CLUSTER_MAX 32
>
> -#define SWAP_MAP_MAX 0x7fff
> -#define SWAP_MAP_BAD 0x8000
> -
> +#define SWAP_MAP_MAX 0x7ffe
> +#define SWAP_MAP_BAD 0x7fff
> +#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */
> +#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE)
> /*
> * The in-memory structure used to track swap areas.
> */
> @@ -300,7 +301,7 @@ extern long total_swap_pages;
> extern void si_swapinfo(struct sysinfo *);
> extern swp_entry_t get_swap_page(void);
> extern swp_entry_t get_swap_page_of_type(int);
> -extern int swap_duplicate(swp_entry_t);
> +extern void swap_duplicate(swp_entry_t);
> extern int swapcache_prepare(swp_entry_t);
> extern int valid_swaphandles(swp_entry_t, unsigned long *);
> extern void swap_free(swp_entry_t);
> @@ -372,9 +373,12 @@ static inline void show_swap_cache_info(
> }
>
> #define free_swap_and_cache(swp) is_migration_entry(swp)
> -#define swap_duplicate(swp) is_migration_entry(swp)
> #define swapcache_prepare(swp) is_migration_entry(swp)
>
> +static inline void swap_duplicate(swp_entry_t swp)
> +{
> +}
> +
> static inline void swap_free(swp_entry_t swp)
> {
> }
> Index: new-trial-swapcount2/mm/swapfile.c
> ===================================================================
> --- new-trial-swapcount2.orig/mm/swapfile.c
> +++ new-trial-swapcount2/mm/swapfile.c
> @@ -53,6 +53,26 @@ static struct swap_info_struct swap_info
>
> static DEFINE_MUTEX(swapon_mutex);
>
> +/* For reference count accounting in swap_map */
> +static inline int swap_count(unsigned short ent)
> +{
> + return ent & SWAP_COUNT_MASK;
> +}
> +
> +static inline int swap_has_cache(unsigned short ent)
> +{
> + return ent & SWAP_HAS_CACHE;
> +}
> +
> +static inline unsigned short make_swap_count(int count, int has_cache)
> +{
> + unsigned short ret = count;
> +
> + if (has_cache)
> + return SWAP_HAS_CACHE | ret;
> + return ret;
> +}
> +
> /*
> * We need this because the bdev->unplug_fn can sleep and we cannot
> * hold swap_lock while calling the unplug_fn. And swap_lock
> @@ -167,7 +187,8 @@ static int wait_for_discard(void *word)
> #define SWAPFILE_CLUSTER 256
> #define LATENCY_LIMIT 256
>
> -static inline unsigned long scan_swap_map(struct swap_info_struct *si)
> +static inline unsigned long scan_swap_map(struct swap_info_struct *si,
> + int cache)
> {
> unsigned long offset;
> unsigned long scan_base;
> @@ -285,7 +306,10 @@ checks:
> si->lowest_bit = si->max;
> si->highest_bit = 0;
> }
> - si->swap_map[offset] = 1;
> + if (cache) /* at usual swap-out via vmscan.c */
> + si->swap_map[offset] = make_swap_count(0, 1);
> + else /* at suspend */
> + si->swap_map[offset] = make_swap_count(1, 0);
> si->cluster_next = offset + 1;
> si->flags -= SWP_SCANNING;
>
> @@ -401,7 +425,8 @@ swp_entry_t get_swap_page(void)
> continue;
>
> swap_list.next = next;
> - offset = scan_swap_map(si);
> + /* This is called for allocating swap entry for cache */
> + offset = scan_swap_map(si, 1);
> if (offset) {
> spin_unlock(&swap_lock);
> return swp_entry(type, offset);
> @@ -415,6 +440,7 @@ noswap:
> return (swp_entry_t) {0};
> }
>
> +/* The only caller of this function is now susupend routine */
> swp_entry_t get_swap_page_of_type(int type)
> {
> struct swap_info_struct *si;
> @@ -424,7 +450,8 @@ swp_entry_t get_swap_page_of_type(int ty
> si = swap_info + type;
> if (si->flags & SWP_WRITEOK) {
> nr_swap_pages--;
> - offset = scan_swap_map(si);
> + /* This is called for allocating swap entry, not cache */
> + offset = scan_swap_map(si, 0);
> if (offset) {
> spin_unlock(&swap_lock);
> return swp_entry(type, offset);
> @@ -471,25 +498,36 @@ out:
> return NULL;
> }
>
> -static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
> +static int swap_entry_free(struct swap_info_struct *p,
> + swp_entry_t ent, int cache)
> {
> unsigned long offset = swp_offset(ent);
> - int count = p->swap_map[offset];
> + int count = swap_count(p->swap_map[offset]);
> + int has_cache = swap_has_cache(p->swap_map[offset]);
>
> - if (count < SWAP_MAP_MAX) {
> - count--;
> - p->swap_map[offset] = count;
> - if (!count) {
> - if (offset < p->lowest_bit)
> - p->lowest_bit = offset;
> - if (offset > p->highest_bit)
> - p->highest_bit = offset;
> - if (p->prio > swap_info[swap_list.next].prio)
> - swap_list.next = p - swap_info;
> - nr_swap_pages++;
> - p->inuse_pages--;
> - mem_cgroup_uncharge_swap(ent);
> - }
> + if (!cache) { /* dropping usage count of swap */
> + if (count < SWAP_MAP_MAX) {
> + count--;
> + p->swap_map[offset] = make_swap_count(count, has_cache);
> + }
> + } else { /* dropping swap cache flag */
> + VM_BUG_ON(!has_cache);
> + p->swap_map[offset] = make_swap_count(count, 0);
> +
> + }
> + /* return code. */
> + count = p->swap_map[offset];
> + /* free if no reference */
> + if (!count) {
> + if (offset < p->lowest_bit)
> + p->lowest_bit = offset;
> + if (offset > p->highest_bit)
> + p->highest_bit = offset;
> + if (p->prio > swap_info[swap_list.next].prio)
> + swap_list.next = p - swap_info;
> + nr_swap_pages++;
> + p->inuse_pages--;
> + mem_cgroup_uncharge_swap(ent);
> }
> return count;
> }
> @@ -504,7 +542,7 @@ void swap_free(swp_entry_t entry)
>
> p = swap_info_get(entry);
> if (p) {
> - swap_entry_free(p, entry);
> + swap_entry_free(p, entry, 0);
> spin_unlock(&swap_lock);
> }
> }
> @@ -514,9 +552,16 @@ void swap_free(swp_entry_t entry)
> */
> void swapcache_free(swp_entry_t entry, struct page *page)
> {
> + struct swap_info_struct *p;
> +
> if (page)
> mem_cgroup_uncharge_swapcache(page, entry);
> - return swap_free(entry);
> + p = swap_info_get(entry);
> + if (p) {
> + swap_entry_free(p, entry, 1);
> + spin_unlock(&swap_lock);
> + }
> + return;
> }
>
> /*
> @@ -531,8 +576,7 @@ static inline int page_swapcount(struct
> entry.val = page_private(page);
> p = swap_info_get(entry);
> if (p) {
> - /* Subtract the 1 for the swap cache itself */
> - count = p->swap_map[swp_offset(entry)] - 1;
> + count = swap_count(p->swap_map[swp_offset(entry)]);
> spin_unlock(&swap_lock);
> }
> return count;
> @@ -594,7 +638,7 @@ int free_swap_and_cache(swp_entry_t entr
>
> p = swap_info_get(entry);
> if (p) {
> - if (swap_entry_free(p, entry) == 1) {
> + if (swap_entry_free(p, entry, 0) == SWAP_HAS_CACHE) {
> page = find_get_page(&swapper_space, entry.val);
> if (page && !trylock_page(page)) {
> page_cache_release(page);
> @@ -901,7 +945,7 @@ static unsigned int find_next_to_unuse(s
> i = 1;
> }
> count = si->swap_map[i];
> - if (count && count != SWAP_MAP_BAD)
> + if (count && swap_count(count) != SWAP_MAP_BAD)
> break;
> }
> return i;
> @@ -1005,13 +1049,13 @@ static int try_to_unuse(unsigned int typ
> */
> shmem = 0;
> swcount = *swap_map;
> - if (swcount > 1) {
> + if (swap_count(swcount)) {
> if (start_mm == &init_mm)
> shmem = shmem_unuse(entry, page);
> else
> retval = unuse_mm(start_mm, entry, page);
> }
> - if (*swap_map > 1) {
> + if (swap_count(*swap_map)) {
> int set_start_mm = (*swap_map >= swcount);
> struct list_head *p = &start_mm->mmlist;
> struct mm_struct *new_start_mm = start_mm;
> @@ -1021,7 +1065,7 @@ static int try_to_unuse(unsigned int typ
> atomic_inc(&new_start_mm->mm_users);
> atomic_inc(&prev_mm->mm_users);
> spin_lock(&mmlist_lock);
> - while (*swap_map > 1 && !retval && !shmem &&
> + while (swap_count(*swap_map) && !retval && !shmem &&
> (p = p->next) != &start_mm->mmlist) {
> mm = list_entry(p, struct mm_struct, mmlist);
> if (!atomic_inc_not_zero(&mm->mm_users))
> @@ -1033,14 +1077,16 @@ static int try_to_unuse(unsigned int typ
> cond_resched();
>
> swcount = *swap_map;
> - if (swcount <= 1)
> + if (!swap_count(swcount)) /* any usage ? */
> ;
> else if (mm == &init_mm) {
> set_start_mm = 1;
> shmem = shmem_unuse(entry, page);
> } else
> retval = unuse_mm(mm, entry, page);
> - if (set_start_mm && *swap_map < swcount) {
> +
> + if (set_start_mm &&
> + swap_count(*swap_map) < swcount) {
> mmput(new_start_mm);
> atomic_inc(&mm->mm_users);
> new_start_mm = mm;
> @@ -1067,21 +1113,25 @@ static int try_to_unuse(unsigned int typ
> }
>
> /*
> - * How could swap count reach 0x7fff when the maximum
> - * pid is 0x7fff, and there's no way to repeat a swap
> - * page within an mm (except in shmem, where it's the
> - * shared object which takes the reference count)?
> - * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
> - *
> + * How could swap count reach 0x7ffe ?
> + * There's no way to repeat a swap page within an mm
> + * (except in shmem, where it's the shared object which takes
> + * the reference count)?
> + * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
> + * short is too small....)
> * If that's wrong, then we should worry more about
> * exit_mmap() and do_munmap() cases described above:
> * we might be resetting SWAP_MAP_MAX too early here.
> * We know "Undead"s can happen, they're okay, so don't
> * report them; but do report if we reset SWAP_MAP_MAX.
> */
> - if (*swap_map == SWAP_MAP_MAX) {
> + /* We might release the lock_page() in unuse_mm(). */
> + if (!PageSwapCache(page) || page_private(page) != entry.val)
> + goto retry;
> +
> + if (swap_count(*swap_map) == SWAP_MAP_MAX) {
> spin_lock(&swap_lock);
> - *swap_map = 1;
> + *swap_map = make_swap_count(0, 1);
> spin_unlock(&swap_lock);
> reset_overflow = 1;
> }
> @@ -1099,7 +1149,8 @@ static int try_to_unuse(unsigned int typ
> * pages would be incorrect if swap supported "shared
> * private" pages, but they are handled by tmpfs files.
> */
> - if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
> + if (swap_count(*swap_map) &&
> + PageDirty(page) && PageSwapCache(page)) {
> struct writeback_control wbc = {
> .sync_mode = WB_SYNC_NONE,
> };
> @@ -1126,6 +1177,7 @@ static int try_to_unuse(unsigned int typ
> * mark page dirty so shrink_page_list will preserve it.
> */
> SetPageDirty(page);
> +retry:
> unlock_page(page);
> page_cache_release(page);
>
> @@ -1952,15 +2004,22 @@ void si_swapinfo(struct sysinfo *val)
> *
> * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
> * "permanent", but will be reclaimed by the next swapoff.
> + * Returns error code in following case.
> + * - success -> 0
> + * - swp_entry is invalid -> EINVAL
> + * - swp_entry is migration entry -> EINVAL
> + * - swap-cache reference is requested but there is already one. -> EEXIST
> + * - swap-cache reference is requested but the entry is not used. -> ENOENT
> */
> -int swap_duplicate(swp_entry_t entry)
> +static int __swap_duplicate(swp_entry_t entry, int cache)
> {
> struct swap_info_struct * p;
> unsigned long offset, type;
> - int result = 0;
> + int result = -EINVAL;
> + int count, has_cache;
>
> if (is_migration_entry(entry))
> - return 1;
> + return -EINVAL;
>
> type = swp_type(entry);
> if (type >= nr_swapfiles)
> @@ -1969,17 +2028,37 @@ int swap_duplicate(swp_entry_t entry)
> offset = swp_offset(entry);
>
> spin_lock(&swap_lock);
> - if (offset < p->max && p->swap_map[offset]) {
> - if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
> - p->swap_map[offset]++;
> - result = 1;
> - } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
> +
> + if (unlikely(offset >= p->max))
> + goto unlock_out;
> +
> + count = swap_count(p->swap_map[offset]);
> + has_cache = swap_has_cache(p->swap_map[offset]);
> + if (cache) { /* called for swapcache/swapin-readahead */
> + /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> + if (!has_cache && count) {
> + p->swap_map[offset] = make_swap_count(count, 1);
> + result = 0;
> + } else if (has_cache)
> + result = -EEXIST;
> + else if (!count)
> + result = -ENOENT;
> + } else if (count || has_cache) {
> + if (count < SWAP_MAP_MAX - 1) {
> + p->swap_map[offset] = make_swap_count(count + 1,
> + has_cache);
> + result = 0;
> + } else if (count <= SWAP_MAP_MAX) {
> if (swap_overflow++ < 5)
> - printk(KERN_WARNING "swap_dup: swap entry overflow\n");
> - p->swap_map[offset] = SWAP_MAP_MAX;
> - result = 1;
> - }
> - }
> + printk(KERN_WARNING
> + "swap_dup: swap entry overflow\n");
> + p->swap_map[offset] = make_swap_count(SWAP_MAP_MAX,
> + has_cache);
> + result = 0;
> + }
> + } else
> + result = -ENOENT; /* unused swap entry */
> +unlock_out:
> spin_unlock(&swap_lock);
> out:
> return result;
> @@ -1988,13 +2067,25 @@ bad_file:
> printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
> goto out;
> }
> +/*
> + * increase reference count of swap entry by 1.
> + */
> +void swap_duplicate(swp_entry_t entry)
> +{
> + __swap_duplicate(entry, 0);
> +}
>
> /*
> + * @entry: swap entry for which we allocate swap cache.
> + *
> * Called when allocating swap cache for exising swap entry,
> + * This can return error codes. Returns 0 at success.
> + * -EBUSY means there is a swap cache.
> + * Note: return code is different from swap_duplicate().
> */
> int swapcache_prepare(swp_entry_t entry)
> {
> - return swap_duplicate(entry);
> + return __swap_duplicate(entry, 1);
> }
>
>
> @@ -2035,7 +2126,7 @@ int valid_swaphandles(swp_entry_t entry,
> /* Don't read in free or bad pages */
> if (!si->swap_map[toff])
> break;
> - if (si->swap_map[toff] == SWAP_MAP_BAD)
> + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
> break;
> }
> /* Count contiguous allocated slots below our target */
> @@ -2043,7 +2134,7 @@ int valid_swaphandles(swp_entry_t entry,
> /* Don't read in free or bad pages */
> if (!si->swap_map[toff])
> break;
> - if (si->swap_map[toff] == SWAP_MAP_BAD)
> + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
> break;
> }
> spin_unlock(&swap_lock);
> Index: new-trial-swapcount2/mm/swap_state.c
> ===================================================================
> --- new-trial-swapcount2.orig/mm/swap_state.c
> +++ new-trial-swapcount2/mm/swap_state.c
> @@ -292,7 +292,10 @@ struct page *read_swap_cache_async(swp_e
> /*
> * Swap entry may have been freed since our caller observed it.
> */
> - if (!swapcache_prepare(entry))
> + err = swapcache_prepare(entry);
> + if (err == -EEXIST) /* seems racy */
> + continue;
> + if (err) /* swp entry is obsolete ? */
> break;
>
> /*
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/