[RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate()

From: Mike Kravetz
Date: Thu Apr 16 2015 - 19:04:06 EST


This is based on the shmem version, but it has diverged quite
a bit. We have no swap to worry about, nor the new file sealing.

What this allows us to do is move physical memory in and out of
a hugetlbfs file without having it mapped. This also gives us
the ability to support MADV_REMOVE since it is currently
implemented using fallocate(). MADV_REMOVE lets us remove data
from the middle of a hugetlbfs file, which wasn't possible before.

hugetlbfs fallocate only operates on whole huge pages.

Based-on code-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
---
fs/hugetlbfs/inode.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/hugetlb.h | 3 ++
mm/hugetlb.c | 2 +-
3 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d5b67fd..6d48c8f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
@@ -377,6 +378,143 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart, loff_t lend)
hugetlb_unreserve_pages(inode, start, freed);
}

+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct hstate *h = hstate_inode(inode);
+ unsigned long hpage_size = huge_page_size(h);
+ loff_t hole_start, hole_end;
+
+ /*
+ * For hole punch round up the beginning offset of the hole and
+ * round down the end.
+ */
+ hole_start = (offset + hpage_size - 1) & ~huge_page_mask(h);
+ hole_end = (offset + len - (hpage_size - 1)) * ~huge_page_mask(h);
+
+ if ((u64)hole_end > (u64)hole_start) {
+ struct address_space *mapping = &inode->i_data;
+
+ mutex_lock(&inode->i_mutex);
+ unmap_mapping_range(mapping, hole_start, hole_end, 0);
+ truncate_hugepages(inode, hole_start, hole_end);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ return 0;
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_inode(inode);
+ struct vm_area_struct pseudo_vma;
+ unsigned long hpage_size = huge_page_size(h);
+ unsigned long hpage_shift = huge_page_shift(h);
+ pgoff_t start, index, end;
+ unsigned long addr;
+ int error;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return hugetlbfs_punch_hole(inode, offset, len);
+
+ /*
+ * Default preallocate case.
+ * For this range, start is rounded down and end is rounded up.
+ */
+ start = offset >> hpage_shift;
+ end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+ mutex_lock(&inode->i_mutex);
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ /*
+ * Initialize a pseudo vma that just contains the policy used
+ * when allocating the huge pages. The actual policy field
+ * (vm_policy) is determined based on the index in the loop below.
+ */
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_start = 0;
+ pseudo_vma.vm_flags |= (VM_HUGETLB | VM_MAYSHARE);
+ pseudo_vma.vm_file = file;
+
+ /* addr is the offset within the file (zero based) */
+ addr = start * hpage_size;
+ for (index = start; index < end; index++) {
+ /*
+ * This is supposed to be the vaddr where the page is being
+ * faulted in, but we have no vaddr here.
+ */
+ struct page *page;
+ int avoid_reserve = 1;
+
+ cond_resched();
+
+ /*
+ * fallocate(2) manpage permits EINTR; we may have been
+ * interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+ page = find_get_page(mapping, index);
+ if (page) {
+ put_page(page);
+ continue;
+ }
+
+ /* Get policy based on index */
+ pseudo_vma.vm_policy =
+ mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+ index);
+
+ page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ mpol_cond_put(pseudo_vma.vm_policy);
+ if (IS_ERR(page)) {
+ error = PTR_ERR(page);
+ goto out;
+ }
+ clear_huge_page(page, addr, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+ error = huge_add_to_page_cache(page, mapping, index);
+ if (error) {
+ put_page(page);
+ /* Keep going if we see an -EEXIST */
+ if (error != -EEXIST)
+ goto out; /* FIXME, need to free? */
+ }
+
+ /*
+ * page_put due to reference from alloc_huge_page()
+ * unlock_page because locked by add_to_page_cache()
+ */
+ put_page(page);
+ unlock_page(page);
+
+ /* Increment addr for next huge page */
+ addr += hpage_size;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
@@ -743,6 +881,7 @@ const struct file_operations hugetlbfs_file_operations = {
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
.llseek = default_llseek,
+ .fallocate = hugetlbfs_fallocate,
};

static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6425945..d96b88e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -322,6 +322,8 @@ struct huge_bootmem_page {
#endif
};

+struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_node(struct hstate *h, int nid);
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
@@ -476,6 +478,7 @@ static inline bool hugepages_supported(void)

#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
+#define alloc_huge_page(v, a, r) NULL
#define alloc_huge_page_node(h, nid) NULL
#define alloc_huge_page_noerr(v, a, r) NULL
#define alloc_bootmem_huge_page(h) NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7cda328..e130c6d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1363,7 +1363,7 @@ static void vma_commit_reservation(struct hstate *h,
region_add(resv, idx, idx + 1);
}

-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/