Re: [RFC PATCH 1/3] mm: make persistent huge zero folio read-only

From: Dev Jain

Date: Wed May 27 2026 - 09:39:03 EST




On 27/05/26 9:26 am, Xueyuan Chen wrote:
> From: Xueyuan Chen <xueyuan.chen21@xxxxxxxxx>
>
> The huge zero folio is shared globally, and its contents should never
> change after initialization. As Jann Horn pointed out[1], the kernel has
> had bugs, including security bugs, where read-only pages were later written
> to. If the huge zero folio is read-only in the direct map, such writes
> fault instead of silently corrupting shared zero contents.
>
> For the persistent huge zero folio, set this up once after the folio is
> allocated at boot.
>
> The permission change is best-effort. If the architecture cannot safely
> make the direct map read-only, keep using the writable persistent huge zero
> folio.
>
> While at it, mark the huge_zero_folio pointer itself __ro_after_init.
> READONLY_HUGE_ZERO_FOLIO depends on PERSISTENT_HUGE_ZERO_FOLIO, so the
> pointer is initialized during boot and never replaced.
>
> This was inspired by Jann Horn's read-only zero page work[1] and follow-up
> discussion[2] with Yang Shi.
>
> [1] https://lore.kernel.org/linux-mm/20260508-ro-zeropage-v1-1-9808abc20b49@xxxxxxxxxx/
> [2] https://lore.kernel.org/linux-mm/CAHbLzkrXXe7r3n3jXgDKtwZhRqj=jDx9E6dLOULohnhBguvi9A@xxxxxxxxxxxxxx/
>
> Co-developed-by: Lance Yang <lance.yang@xxxxxxxxx>
> Signed-off-by: Lance Yang <lance.yang@xxxxxxxxx>
> Signed-off-by: Xueyuan Chen <xueyuan.chen21@xxxxxxxxx>
> ---
> include/linux/huge_mm.h | 5 +++++
> mm/Kconfig | 17 +++++++++++++++++
> mm/huge_memory.c | 25 ++++++++++++++++++++++++-
> 3 files changed, 46 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index edece3e26985..45d1352619d1 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -5,6 +5,7 @@
> #include <linux/mm_types.h>
>
> #include <linux/fs.h> /* only for vma_is_dax() */
> +#include <linux/init.h>
> #include <linux/kobject.h>
>
> vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
> @@ -554,6 +555,10 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
> struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
> void mm_put_huge_zero_folio(struct mm_struct *mm);
>
> +#ifdef CONFIG_READONLY_HUGE_ZERO_FOLIO
> +bool __init arch_make_huge_zero_folio_readonly(struct folio *folio);
> +#endif
> +
> static inline struct folio *get_persistent_huge_zero_folio(void)
> {
> if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 776b67c66e82..f31200816646 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -787,6 +787,23 @@ config PERSISTENT_HUGE_ZERO_FOLIO
> Say Y if your system has lots of memory. Say N if you are
> memory constrained.
>
> +config ARCH_HAS_READONLY_HUGE_ZERO_FOLIO
> + bool
> +
> +config READONLY_HUGE_ZERO_FOLIO
> + bool "Map the huge zero folio read-only in the direct map"
> + depends on PERSISTENT_HUGE_ZERO_FOLIO
> + depends on ARCH_HAS_READONLY_HUGE_ZERO_FOLIO
> + help
> + The persistent huge zero folio is shared globally, and nothing
> + should ever change its contents after initialization.
> +
> + When supported, mark the folio read-only in the direct map so such
> + writes trigger a fault instead of silently corrupting the zero contents.
> +
> + If the permission change is not supported, the kernel keeps using
> + the writable persistent huge zero folio.
> +
> config MM_ID
> def_bool n
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index bf9b480bb3b0..c568755dd58e 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -75,7 +75,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> static bool split_underused_thp = true;
>
> static atomic_t huge_zero_refcount;
> +#ifdef CONFIG_READONLY_HUGE_ZERO_FOLIO
> +struct folio *huge_zero_folio __ro_after_init;


Can we guard this with CONFIG_PERSISTENT_HUGE_ZERO_FOLIO instead? In that
case too, the pointer is never written after init.

> +#else
> struct folio *huge_zero_folio __read_mostly;
> +#endif
> unsigned long huge_zero_pfn __read_mostly = ~0UL;
> unsigned long huge_anon_orders_always __read_mostly;
> unsigned long huge_anon_orders_madvise __read_mostly;
> @@ -305,6 +309,18 @@ static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
> return 0;
> }
>
> +#ifdef CONFIG_READONLY_HUGE_ZERO_FOLIO
> +static bool __init make_huge_zero_folio_readonly(void)
> +{
> + return arch_make_huge_zero_folio_readonly(READ_ONCE(huge_zero_folio));


I think READ_ONCE() is not required here, since no one is going to change
the pointer after allocation.


> +}
> +#else
> +static bool __init make_huge_zero_folio_readonly(void)
> +{
> + return false;
> +}
> +#endif
> +
> static struct shrinker *huge_zero_folio_shrinker;
>
> #ifdef CONFIG_SYSFS
> @@ -965,8 +981,15 @@ static int __init thp_shrinker_init(void)
> * that get_huge_zero_folio() will most likely not fail as
> * thp_shrinker_init() is invoked early on during boot.
> */
> - if (!get_huge_zero_folio())
> + if (!get_huge_zero_folio()) {
> pr_warn("Allocating persistent huge zero folio failed\n");
> + return 0;
> + }
> +
> + if (IS_ENABLED(CONFIG_READONLY_HUGE_ZERO_FOLIO) &&
> + !make_huge_zero_folio_readonly())
> + pr_warn("Making persistent huge zero folio read-only failed\n");
> +
> return 0;
> }
>