Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c

From: Mike Kravetz
Date: Wed Sep 06 2023 - 20:19:14 EST


On 09/04/23 21:05, Sidhartha Kumar wrote:
> On 8/21/23 11:33 AM, Mike Kravetz wrote:
> > On 08/17/23 11:18, Sidhartha Kumar wrote:
> > > Remove special cased hugetlb handling code within the page cache by
> > > changing the granularity of each index to the base page size rather than
> > > the huge page size. Adds new wrappers for hugetlb code to interact with the
> > > page cache which convert to a linear index.
> > <snip>
> > > @@ -237,7 +234,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
> > > if (free_folio)
> > > free_folio(folio);
> > > - if (folio_test_large(folio) && !folio_test_hugetlb(folio))
> > > + if (folio_test_large(folio))
> > > refs = folio_nr_pages(folio);
> > > folio_put_refs(folio, refs);
> > > }
> > > @@ -858,14 +855,15 @@ noinline int __filemap_add_folio(struct address_space *mapping,
> > > if (!huge) {
> > > int error = mem_cgroup_charge(folio, NULL, gfp);
> > > - VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
> > > if (error)
> > > return error;
> > > charged = true;
> > > - xas_set_order(&xas, index, folio_order(folio));
> > > - nr = folio_nr_pages(folio);
> > > }
> >
> > When a hugetlb page is added to the page cache, the ref count will now
> > be increased by folio_nr_pages. So, the ref count for a 2MB hugetlb page
> > on x86 will be increased by 512.
> >
> > We will need a corresponding change to migrate_huge_page_move_mapping().
> > For migration, the ref count is checked as follows:
> >
> > xas_lock_irq(&xas);
> > expected_count = 2 + folio_has_private(src);
> Hi Mike,
>
> Thanks for catching this. Changing this line to:
> + expected_count = folio_expected_refs(mapping, src);
> seems to fix migration from my testing. My test was inserting a sleep() in
> the hugepage-mmap.c selftest and running the migratepages command.
>
> With this version of the patch:
> migrate_pages(44906, 65, [0x0000000000000001], [0x0000000000000002]) = 75
> which means 75 pages did not migrate and after the change to
> folio_expected_refs():
> migrate_pages(7344, 65, [0x0000000000000001], [0x0000000000000002]) = 0
>
> Does that change look correct to you?

I just ran the simple attached test program (don't laugh) on the suggested
change. Command line './move-pages 2 /var/opt/oracle/hugepool/foo'.
Unfortunately, migration is not working as expected. The source pages of
the migration are not freed.

I have not taken a closer look at the code to get an idea about root cause.
Certainly, it has to do with the ref counts. I can look closer in a day or
two if you have not resolved the issue.
--
Mike Kravetz
/*
 * move-pages (derived from the hugepage-mmap example):
 *
 * Test of hugetlb page migration on huge page memory that a user application
 * maps using the mmap system call. Before running this application, make
 * sure that the administrator has mounted the hugetlbfs filesystem (on some
 * directory like /mnt) using the command mount -t hugetlbfs nodev /mnt. The
 * app maps num_hpages huge pages of the named hugetlbfs file, faults them
 * in, and then, on each key press, moves them to node 1 and back to node 0.
 *
 * For the ia64 architecture, the Linux kernel reserves Region number 4 for
 * huge pages. That means that if one requires a fixed address, a huge page
 * aligned address starting with 0x800000... will be required. If a fixed
 * address is not required, the kernel will select an address in the proper
 * range.
 * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
 */

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#define __USE_GNU
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <time.h>
#include <numa.h>
#include <numaif.h>

/* Usage text; the %s is filled with the program name. Has no trailing newline. */
#define USAGE "USAGE: %s num_hpages hugepagefile_name"
/* Huge page size in bytes (2 MiB); hard-coded rather than queried from the system. */
#define H_PAGESIZE (2 * 1024 * 1024)
/*
 * Base page size in bytes; main() uses H_PAGESIZE / B_PAGESIZE to convert a
 * huge page count into a base page count.
 */
#define B_PAGESIZE (4096)

/* Not referenced in the visible part of this file. */
#define ITERATIONS 100000

/*
 * Arguments handed to mmap(): read/write protection, a zero address hint
 * (no fixed address requested), and a shared mapping.
 */
#define PROTECTION (PROT_READ | PROT_WRITE)
#define ADDR (void *)(0x0UL)
#define FLAGS (MAP_SHARED)

int main(int argc, char ** argv)
{
char *f_name;
char *sep;
char ch;
int fd;
long i;
long long hpages, bpages;
void *addr;
char foo;
long count = 0;
void **pages;
int *nodes;
int *status;
int flags;
long m_ret;
/*
* HARD CODED FOR TWO NODES: 0 and 1
*/
unsigned long node0_mask = 01L << 0;
unsigned long node1_mask = 01L << 1;

if (argc != 3) {
printf(USAGE, argv[0]);
exit (1);
}

hpages = strtol(argv[1], &sep, 0);
if (errno || hpages < 0) {
printf("Invalid number hpages (%s)\n", argv[1]);
printf(USAGE, argv[0]);
exit (1);
}
bpages = hpages * (H_PAGESIZE / B_PAGESIZE);

f_name = argv[2];
fd = open(f_name, O_CREAT | O_RDWR, 0755);
if (fd < 0) {
printf("Open of %s failed", argv[2]);
exit(1);
}

addr = mmap(ADDR, hpages * H_PAGESIZE, PROTECTION, FLAGS, fd, 0);
if (addr == MAP_FAILED) {
perror("mmap");
exit (1);
}
printf("%ld huge pages mapped at 0x%lx\n", hpages,
( unsigned long)addr);
printf("Faulting in all pages\n");
for (i=0; i < hpages; i++)
foo = *((char *)(addr + (i * H_PAGESIZE)));

pages = malloc(bpages * sizeof(void *));
nodes = malloc(bpages * sizeof(int));
status = malloc(bpages * sizeof(int));
if (!pages || !nodes || !status) {
printf("error allocating memory for arrays\n");
exit (1);
}

while (1) {
printf("Hit any key to move hugetlb pages to node 1\n");
read(STDIN_FILENO, &ch, 1);

for (i=0; i < hpages; i++) {
pages[i] = addr + (i * H_PAGESIZE);
// pages[i] = addr + (i * H_PAGESIZE) + B_PAGESIZE;
nodes[i] = 1;
status[i] = -1;
flags = MPOL_MF_MOVE_ALL;
}
m_ret = numa_move_pages(0, hpages, pages, nodes, status, flags);
if (m_ret) {
perror("move_pages");
if (m_ret > 0)
printf("%ld pages not migrated\n", m_ret);
} else {
printf("Success!\n");
}
for (i=0; i < hpages; i++) {
printf("\tstatus[%d] = %d\n", i, status[i]);
status[i] = -1;
}

printf("Hit any key to move hugetlb pages to node 0\n");
read(STDIN_FILENO, &ch, 1);
for (i=0; i < hpages; i++) {
pages[i] = addr + (i * H_PAGESIZE);
// pages[i] = addr + (i * H_PAGESIZE) + B_PAGESIZE;
nodes[i] = 0;
status[i] = -1;
flags = MPOL_MF_MOVE_ALL;
}
m_ret = numa_move_pages(0, hpages, pages, nodes, status, flags);
if (m_ret) {
perror("move_pages");
if (m_ret > 0)
printf("%ld pages not migrated\n", m_ret);
} else {
printf("Success!\n");
}
for (i=0; i < hpages; i++) {
printf("\tstatus[%d] = %d\n", i, status[i]);
status[i] = -1;
}
}

munmap(addr, hpages * H_PAGESIZE);
close(fd);

return 0;
}