Re: [PATCH] mm: swap: determine swap device by using page nid
From: Michal Hocko
Date: Thu Apr 07 2022 - 03:52:19 EST
[Cc Aaron who has introduced the per node swap changes]
On Wed 06-04-22 19:09:53, Yang Shi wrote:
> The swap devices are linked to per node priority lists, the swap device
> closer to the node has higher priority on that node's priority list.
> This is supposed to improve I/O latency, particularly for some fast
> devices. But the current code gets nid by calling numa_node_id() which
> actually returns the nid that the reclaimer is running on instead of the
> nid that the page belongs to.
>
> Pass the page's nid dow to get_swap_pages() in order to pick up the
> right swap device. But it doesn't work for the swap slots cache which
> is per cpu. We could skip swap slots cache if the current node is not
> the page's node, but it may be overkilling. So keep using the current
> node's swap slots cache. The issue was found by visual code inspection
> so it is not sure how much improvement could be achieved due to lack of
> suitable testing device. But anyway the current code does violate the
> design.
Do you have any perf numbers for this change?
> Cc: Huang Ying <ying.huang@xxxxxxxxx>
> Signed-off-by: Yang Shi <shy828301@xxxxxxxxx>
> ---
> include/linux/swap.h | 3 ++-
> mm/swap_slots.c | 7 ++++---
> mm/swapfile.c | 5 ++---
> 3 files changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 27093b477c5f..e442cf6b61ea 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -497,7 +497,8 @@ extern void si_swapinfo(struct sysinfo *);
> extern swp_entry_t get_swap_page(struct page *page);
> extern void put_swap_page(struct page *page, swp_entry_t entry);
> extern swp_entry_t get_swap_page_of_type(int);
> -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
> +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size,
> + int node);
> extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> extern void swap_shmem_alloc(swp_entry_t);
> extern int swap_duplicate(swp_entry_t);
> diff --git a/mm/swap_slots.c b/mm/swap_slots.c
> index 2b5531840583..a1c5cf6a4302 100644
> --- a/mm/swap_slots.c
> +++ b/mm/swap_slots.c
> @@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
> cache->cur = 0;
> if (swap_slot_cache_active)
> cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
> - cache->slots, 1);
> + cache->slots, 1, numa_node_id());
>
> return cache->nr;
> }
> @@ -305,12 +305,13 @@ swp_entry_t get_swap_page(struct page *page)
> {
> swp_entry_t entry;
> struct swap_slots_cache *cache;
> + int nid = page_to_nid(page);
>
> entry.val = 0;
>
> if (PageTransHuge(page)) {
> if (IS_ENABLED(CONFIG_THP_SWAP))
> - get_swap_pages(1, &entry, HPAGE_PMD_NR);
> + get_swap_pages(1, &entry, HPAGE_PMD_NR, nid);
> goto out;
> }
>
> @@ -342,7 +343,7 @@ swp_entry_t get_swap_page(struct page *page)
> goto out;
> }
>
> - get_swap_pages(1, &entry, 1);
> + get_swap_pages(1, &entry, 1, nid);
> out:
> if (mem_cgroup_try_charge_swap(page, entry)) {
> put_swap_page(page, entry);
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 63c61f8b2611..151fffe0fd60 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1036,13 +1036,13 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> swap_range_free(si, offset, SWAPFILE_CLUSTER);
> }
>
> -int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
> +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size,
> + int node)
> {
> unsigned long size = swap_entry_size(entry_size);
> struct swap_info_struct *si, *next;
> long avail_pgs;
> int n_ret = 0;
> - int node;
>
> /* Only single cluster request supported */
> WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
> @@ -1060,7 +1060,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
> atomic_long_sub(n_goal * size, &nr_swap_pages);
>
> start_over:
> - node = numa_node_id();
> plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
> /* requeue si to after same-priority siblings */
> plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
> --
> 2.26.3
--
Michal Hocko
SUSE Labs