Re: [PATCH 2/2] mm,memory_hotplug: {READ,WRITE}_ONCE unsynchronized zone data

From: Lance Yang
Date: Wed May 22 2024 - 00:25:57 EST


Hi Brendan,

On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@xxxxxxxxxx> wrote:
>
> These fields are written by memory hotplug under mem_hotplug_lock but
> read without any lock. It seems like reader code is robust against the
> value being stale or "from the future", but we also need to account
> for:
>
> 1. Load/store tearing (according to Linus[1], this really happens,
> even when everything is aligned as you would hope).
>
> 2. Invented loads[2] - the compiler can spill and re-read these fields
> ([2] calls this "invented loads") and assume that they have not
> changed.
>
> Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
> for write, but we still need WRITE_ONCE to prevent store-tearing.
>
> [1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@xxxxxxxxxxxxxx/T/#u
> As discovered via the original big-bad article[2]
> [2] https://lwn.net/Articles/793253/
>
> Signed-off-by: Brendan Jackman <jackmanb@xxxxxxxxxx>
> ---
> include/linux/mmzone.h | 14 ++++++++++----
> mm/compaction.c | 2 +-
> mm/memory_hotplug.c | 20 ++++++++++++--------
> mm/mm_init.c | 2 +-
> mm/page_alloc.c | 2 +-
> mm/show_mem.c | 8 ++++----
> mm/vmstat.c | 4 ++--
> 7 files changed, 31 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 194ef7fed9d6..bdb3be76d10c 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1018,11 +1018,13 @@ static inline unsigned long zone_cma_pages(struct zone *zone)
> #endif
> }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
> static inline unsigned long zone_end_pfn(const struct zone *zone)
> {
> - return zone->zone_start_pfn + zone->spanned_pages;
> + return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
> }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
> static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
> {
> return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
> @@ -1033,9 +1035,10 @@ static inline bool zone_is_initialized(struct zone *zone)
> return zone->initialized;
> }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
> static inline bool zone_is_empty(struct zone *zone)
> {
> - return zone->spanned_pages == 0;
> + return READ_ONCE(zone->spanned_pages) == 0;
> }
>
> #ifndef BUILD_VDSO32_64
> @@ -1485,10 +1488,13 @@ static inline bool managed_zone(struct zone *zone)
> return zone_managed_pages(zone);
> }
>
> -/* Returns true if a zone has memory */
> +/*
> + * Returns true if a zone has memory.
> + * This is unstable unless you old mem_hotplug_lock.
> + */
> static inline bool populated_zone(struct zone *zone)
> {
> - return zone->present_pages;
> + return READ_ONCE(zone->present_pages);
> }
>
> #ifdef CONFIG_NUMA
> diff --git a/mm/compaction.c b/mm/compaction.c
> index e731d45befc7..b8066d1fdcf5 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -2239,7 +2239,7 @@ static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
> {
> unsigned long score;
>
> - score = zone->present_pages * fragmentation_score_zone(zone);
> + score = READ_ONCE(zone->present_pages) * fragmentation_score_zone(zone);
> return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
> }
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 431b1f6753c0..71b5e3d314a2 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -463,6 +463,8 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
> int nid = zone_to_nid(zone);
>
> if (zone->zone_start_pfn == start_pfn) {
> + unsigned long old_end_pfn = zone_end_pfn(zone);
> +
> /*
> * If the section is smallest section in the zone, it need
> * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
> @@ -470,13 +472,13 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
> * for shrinking zone.
> */
> pfn = find_smallest_section_pfn(nid, zone, end_pfn,
> - zone_end_pfn(zone));
> + old_end_pfn);
> if (pfn) {
> - zone->spanned_pages = zone_end_pfn(zone) - pfn;
> + WRITE_ONCE(zone->spanned_pages, old_end_pfn - pfn);
> zone->zone_start_pfn = pfn;
> } else {
> zone->zone_start_pfn = 0;
> - zone->spanned_pages = 0;
> + WRITE_ONCE(zone->spanned_pages, 0);
> }
> } else if (zone_end_pfn(zone) == end_pfn) {
> /*
> @@ -488,10 +490,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
> pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
> start_pfn);
> if (pfn)
> - zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
> + WRITE_ONCE(zone->spanned_pages,
> + pfn - zone->zone_start_pfn + 1);
> else {
> zone->zone_start_pfn = 0;
> - zone->spanned_pages = 0;
> + WRITE_ONCE(zone->spanned_pages, 0);
> }
> }
> }
> @@ -710,7 +713,8 @@ static void __meminit resize_zone_range(struct zone *zone, unsigned long start_p
> if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
> zone->zone_start_pfn = start_pfn;
>
> - zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
> + WRITE_ONCE(zone->spanned_pages,
> + max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn);
> }
>
> static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
> @@ -795,7 +799,7 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
> struct zone *zone)
> {
> if (zone_idx(zone) == ZONE_MOVABLE) {
> - stats->movable_pages += zone->present_pages;
> + stats->movable_pages += READ_ONCE(zone->present_pages);
> } else {
> stats->kernel_early_pages += zone->present_early_pages;
> #ifdef CONFIG_CMA
> @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
> */
> if (early_section(__pfn_to_section(page_to_pfn(page))))
> zone->present_early_pages += nr_pages;
> - zone->present_pages += nr_pages;
> + WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);

I'm not sure that using the WRITE_ONCE() wrapper would prevent load tearing
on 'zone->present_pages', but it's probably just me overthinking it :)

Thanks,
Lance

> zone->zone_pgdat->node_present_pages += nr_pages;
>
> if (group && movable)
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index c725618aeb58..ec66f2eadb95 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1540,7 +1540,7 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
> for (z = 0; z < MAX_NR_ZONES; z++) {
> struct zone *zone = pgdat->node_zones + z;
>
> - zone->present_pages = 0;
> + WRITE_ONCE(zone->present_pages, 0);
> zone_init_internals(zone, z, nid, 0);
> }
> }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5116a2b9ea6e..1eb9000ec7d7 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5728,7 +5728,7 @@ __meminit void zone_pcp_init(struct zone *zone)
>
> if (populated_zone(zone))
> pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
> - zone->present_pages, zone_batchsize(zone));
> + READ_ONCE(zone->present_pages), zone_batchsize(zone));
> }
>
> void adjust_managed_page_count(struct page *page, long count)
> diff --git a/mm/show_mem.c b/mm/show_mem.c
> index bdb439551eef..667680a6107b 100644
> --- a/mm/show_mem.c
> +++ b/mm/show_mem.c
> @@ -337,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
> K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
> K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
> K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
> - K(zone->present_pages),
> + K(READ_ONCE(zone->present_pages)),
> K(zone_managed_pages(zone)),
> K(zone_page_state(zone, NR_MLOCK)),
> K(zone_page_state(zone, NR_BOUNCE)),
> @@ -407,11 +407,11 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>
> for_each_populated_zone(zone) {
>
> - total += zone->present_pages;
> - reserved += zone->present_pages - zone_managed_pages(zone);
> + total += READ_ONCE(zone->present_pages);
> + reserved += READ_ONCE(zone->present_pages) - zone_managed_pages(zone);
>
> if (is_highmem(zone))
> - highmem += zone->present_pages;
> + highmem += READ_ONCE(zone->present_pages);
> }
>
> printk("%lu pages RAM\n", total);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 8507c497218b..5a9c4b5768e5 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1708,8 +1708,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
> min_wmark_pages(zone),
> low_wmark_pages(zone),
> high_wmark_pages(zone),
> - zone->spanned_pages,
> - zone->present_pages,
> + READ_ONCE(zone->spanned_pages),
> + READ_ONCE(zone->present_pages),
> zone_managed_pages(zone),
> zone_cma_pages(zone));
>
>
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
>