Re: [PATCH v4] mm/hugetlb: add mempolicy check in the reservation routine

From: Muchun Song
Date: Thu Aug 06 2020 - 03:46:43 EST


On Tue, Jul 28, 2020 at 11:49 AM Muchun Song <songmuchun@xxxxxxxxxxxxx> wrote:
>
> In the reservation routine, we only check whether the cpuset meets
> the memory allocation requirements, but we ignore the mempolicy in
> the MPOL_BIND case. As a result, an mmap of hugetlb memory can
> succeed while the subsequent allocation fails due to mempolicy
> restrictions, and the task receives SIGBUS. This can be reproduced
> with the following steps.
>
> 1) Compile the test case.
> cd tools/testing/selftests/vm/
> gcc map_hugetlb.c -o map_hugetlb
>
> 2) Pre-allocate huge pages. Suppose there are 2 NUMA nodes in the
> system; one huge page will be pre-allocated on each node.
> echo 2 > /proc/sys/vm/nr_hugepages
>
> 3) Run the test case (mmap 4MB). The task receives SIGBUS.
> numactl --membind=0 ./map_hugetlb 4
>
> With this patch applied, the mmap in step 3) fails with
> "mmap: Cannot allocate memory".
>
> Signed-off-by: Muchun Song <songmuchun@xxxxxxxxxxxxx>
> Reported-by: Jianchao Guo <guojianchao@xxxxxxxxxxxxx>
> Suggested-by: Michal Hocko <mhocko@xxxxxxxxxx>
> Reviewed-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>

Hi Andrew,

Any comments? Or did this patch miss the queue for the
merge window?

> ---
> changelog in v4:
> 1) Fix compilation errors with !CONFIG_NUMA.
>
> changelog in v3:
> 1) Do not allocate nodemask on the stack.
> 2) Update comment.
>
> changelog in v2:
> 1) Reuse policy_nodemask().
>
> include/linux/mempolicy.h | 14 ++++++++++++++
> mm/hugetlb.c | 22 ++++++++++++++++++----
> mm/mempolicy.c | 2 +-
> 3 files changed, 33 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index ea9c15b60a96..0656ece1ccf1 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -152,6 +152,15 @@ extern int huge_node(struct vm_area_struct *vma,
> extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
> extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
> const nodemask_t *mask);
> +extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
> +
> +static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
> +{
> + struct mempolicy *mpol = get_task_policy(current);
> +
> + return policy_nodemask(gfp, mpol);
> +}
> +
> extern unsigned int mempolicy_slab_node(void);
>
> extern enum zone_type policy_zone;
> @@ -281,5 +290,10 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
> static inline void mpol_put_task_policy(struct task_struct *task)
> {
> }
> +
> +static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
> +{
> + return NULL;
> +}
> #endif /* CONFIG_NUMA */
> #endif
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 589c330df4db..a34458f6a475 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -3463,13 +3463,21 @@ static int __init default_hugepagesz_setup(char *s)
> }
> __setup("default_hugepagesz=", default_hugepagesz_setup);
>
> -static unsigned int cpuset_mems_nr(unsigned int *array)
> +static unsigned int allowed_mems_nr(struct hstate *h)
> {
> int node;
> unsigned int nr = 0;
> + nodemask_t *mpol_allowed;
> + unsigned int *array = h->free_huge_pages_node;
> + gfp_t gfp_mask = htlb_alloc_mask(h);
> +
> + mpol_allowed = policy_nodemask_current(gfp_mask);
>
> - for_each_node_mask(node, cpuset_current_mems_allowed)
> - nr += array[node];
> + for_each_node_mask(node, cpuset_current_mems_allowed) {
> + if (!mpol_allowed ||
> + (mpol_allowed && node_isset(node, *mpol_allowed)))
> + nr += array[node];
> + }
>
> return nr;
> }
> @@ -3648,12 +3656,18 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
> * we fall back to check against current free page availability as
> * a best attempt and hopefully to minimize the impact of changing
> * semantics that cpuset has.
> + *
> + * Apart from cpuset, the memory policy mechanism also determines
> + * from which node the kernel will allocate memory in a NUMA system.
> + * So, similar to cpuset, we should also take the memory policy of
> + * the current task into account, with the same best-effort caveat
> + * as described above.
> */
> if (delta > 0) {
> if (gather_surplus_pages(h, delta) < 0)
> goto out;
>
> - if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
> + if (delta > allowed_mems_nr(h)) {
> return_unused_surplus_pages(h, delta);
> goto out;
> }
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 93fcfc1f2fa2..fce14c3f4f38 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1873,7 +1873,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
> * Return a nodemask representing a mempolicy for filtering nodes for
> * page allocation
> */
> -static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
> +nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
> {
> /* Lower zones don't get a nodemask applied for MPOL_BIND */
> if (unlikely(policy->mode == MPOL_BIND) &&
> --
> 2.11.0
>


--
Yours,
Muchun