Re: [PATCH] Use MPOL_INTERLEAVE for tmpfs files

From: Brent Casavant
Date: Mon Nov 08 2004 - 15:02:17 EST


On Wed, 3 Nov 2004, Martin J. Bligh wrote:

> Matt has volunteered to write the mount option for this, so let's hold
> off for a couple of days until that's done.

I had the time to do this myself. Updated patch attached below.

Description:

This patch creates a tmpfs mount option and adds code which causes
tmpfs allocations to be interleaved across the nodes of a NUMA machine.
This is useful in situations where a large tmpfs file would otherwise
consume most of the memory on a single node, forcing tasks running on
that node to perform off-node allocations for other (i.e. non-tmpfs)
memory needs.

Tightly synchronized HPC applications with large (on the order of
a single node's total RAM) per-task memory requirements are an example
of a type of application which benefits from this change. Other
types of workloads may prefer local tmpfs allocations, thus a mount
option is provided.

The new mount option is "interleave=", and defaults to 0. Any non-zero
setting turns on interleaving. Interleaving behavior can be changed
via a remount operation.

Signed-off-by: Brent Casavant <bcasavan@xxxxxxx>

Index: linux/mm/mempolicy.c
===================================================================
--- linux.orig/mm/mempolicy.c 2004-11-03 10:24:16.000000000 -0600
+++ linux/mm/mempolicy.c 2004-11-03 10:26:30.000000000 -0600
@@ -1027,6 +1027,28 @@
return 0;
}

+void mpol_shared_policy_init(struct shared_policy *info, unsigned interleave)
+{
+ info->root = RB_ROOT;
+ init_MUTEX(&info->sem);
+
+ if (unlikely(interleave)) {
+ struct mempolicy *newpol;
+
+ /* Falls back to MPOL_DEFAULT on any error */
+ newpol = mpol_new(MPOL_INTERLEAVE, nodes_addr(node_online_map));
+ if (likely(!IS_ERR(newpol))) {
+ /* Create pseudo-vma that contains just the policy */
+ struct vm_area_struct pvma;
+
+ memset(&pvma, 0, sizeof(struct vm_area_struct));
+ /* Policy covers entire file */
+ pvma.vm_end = ~0UL;
+ mpol_set_shared_policy(info, &pvma, newpol);
+ }
+ }
+}
+
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma, struct mempolicy *npol)
{
Index: linux/fs/hugetlbfs/inode.c
===================================================================
--- linux.orig/fs/hugetlbfs/inode.c 2004-11-03 10:24:16.000000000 -0600
+++ linux/fs/hugetlbfs/inode.c 2004-11-03 10:26:30.000000000 -0600
@@ -384,7 +384,7 @@
inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
info = HUGETLBFS_I(inode);
- mpol_shared_policy_init(&info->policy);
+ mpol_shared_policy_init(&info->policy, 0);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
Index: linux/include/linux/mempolicy.h
===================================================================
--- linux.orig/include/linux/mempolicy.h 2004-11-03 10:24:16.000000000 -0600
+++ linux/include/linux/mempolicy.h 2004-11-03 10:26:30.000000000 -0600
@@ -137,11 +137,7 @@
struct semaphore sem;
};

-static inline void mpol_shared_policy_init(struct shared_policy *info)
-{
- info->root = RB_ROOT;
- init_MUTEX(&info->sem);
-}
+void mpol_shared_policy_init(struct shared_policy *info, unsigned interleave);

int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma,
@@ -198,7 +194,8 @@
return -EINVAL;
}

-static inline void mpol_shared_policy_init(struct shared_policy *info)
+static inline void mpol_shared_policy_init(struct shared_policy *info,
+ unsigned interleave)
{
}

Index: linux/mm/shmem.c
===================================================================
--- linux.orig/mm/shmem.c 2004-11-03 10:24:16.000000000 -0600
+++ linux/mm/shmem.c 2004-11-05 17:41:46.000000000 -0600
@@ -66,6 +66,9 @@
#define SHMEM_PAGEIN VM_READ
#define SHMEM_TRUNCATE VM_WRITE

+/* sbinfo flags */
+#define SHMEM_INTERLEAVE 0x1 /* Interleave tmpfs allocations */
+
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

@@ -1236,7 +1239,8 @@
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
- mpol_shared_policy_init(&info->policy);
+ mpol_shared_policy_init(&info->policy,
+ sbinfo && (sbinfo->flags & SHMEM_INTERLEAVE) ? 1 : 0);
INIT_LIST_HEAD(&info->swaplist);

switch (mode & S_IFMT) {
@@ -1784,7 +1788,9 @@
#endif
};

-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid,
+ gid_t *gid, unsigned long *blocks,
+ unsigned long *inodes, int *interleave)
{
char *this_char, *value, *rest;

@@ -1838,6 +1844,12 @@
*gid = simple_strtoul(value,&rest,0);
if (*rest)
goto bad_val;
+ } else if (!strcmp(this_char,"interleave")) {
+ if (!interleave)
+ continue;
+ *interleave = (0 != simple_strtoul(value,&rest,0));
+ if (*rest)
+ goto bad_val;
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
this_char);
@@ -1858,12 +1870,14 @@
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
unsigned long max_blocks = 0;
unsigned long max_inodes = 0;
+ int interleave = 0;

if (sbinfo) {
max_blocks = sbinfo->max_blocks;
max_inodes = sbinfo->max_inodes;
}
- if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
+ if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
+ &max_inodes, &interleave))
return -EINVAL;
/* Keep it simple: disallow limited <-> unlimited remount */
if ((max_blocks || max_inodes) == !sbinfo)
@@ -1871,6 +1885,12 @@
/* But allow the pointless unlimited -> unlimited remount */
if (!sbinfo)
return 0;
+ /* Allow change of interleaving */
+ if (interleave)
+ sbinfo->flags |= SHMEM_INTERLEAVE;
+ else
+ sbinfo->flags &= ~SHMEM_INTERLEAVE;
+
return shmem_set_size(sbinfo, max_blocks, max_inodes);
}
#endif
@@ -1900,6 +1920,7 @@
#ifdef CONFIG_TMPFS
unsigned long blocks = 0;
unsigned long inodes = 0;
+ int interleave = 0;

/*
* Per default we only allow half of the physical ram per
@@ -1912,12 +1933,12 @@
if (inodes > blocks)
inodes = blocks;

- if (shmem_parse_options(data, &mode,
- &uid, &gid, &blocks, &inodes))
+ if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
+ &inodes, &interleave))
return -EINVAL;
}

- if (blocks || inodes) {
+ if (blocks || inodes || interleave) {
struct shmem_sb_info *sbinfo;
sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
if (!sbinfo)
@@ -1928,6 +1949,9 @@
sbinfo->free_blocks = blocks;
sbinfo->max_inodes = inodes;
sbinfo->free_inodes = inodes;
+ sbinfo->flags = 0;
+ if (interleave)
+ sbinfo->flags |= SHMEM_INTERLEAVE;
}
sb->s_xattr = shmem_xattr_handlers;
#endif
Index: linux/include/linux/shmem_fs.h
===================================================================
--- linux.orig/include/linux/shmem_fs.h 2004-10-18 16:54:55.000000000 -0500
+++ linux/include/linux/shmem_fs.h 2004-11-05 17:35:25.000000000 -0600
@@ -26,6 +26,7 @@
unsigned long free_blocks; /* How many are left for allocation */
unsigned long max_inodes; /* How many inodes are allowed */
unsigned long free_inodes; /* How many are left for allocation */
+ unsigned long flags;
spinlock_t stat_lock;
};


--
Brent Casavant If you had nothing to fear,
bcasavan@xxxxxxx how then could you be brave?
Silicon Graphics, Inc. -- Queen Dama, Source Wars
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/