Re: [PATCH] Use MPOL_INTERLEAVE for tmpfs files

From: Hugh Dickins
Date: Thu Nov 11 2004 - 14:56:28 EST


On Tue, 9 Nov 2004, Hugh Dickins wrote:
>
> Not to say that your patch or Adam's will go further (I've no objection
> to the way Adam is using tmpfs, but no opinion on the future of devfs),
> but they're two hints that I should rework that to get out of people's
> way. I'll do a patch for that, then another something like yours on
> top, for you to go back and check.
>
> I think the option should be "mpol=interleave" rather than just
> "interleave", who knows what baroque mpols we might want to support
> there in future?

Okay, two pattachments.

The first (against 2.6.10-rc1-mm5) being my reversion of NULL sbinfo
in shmem.c, to make it easier for others to add things into sbinfo
without having to worry about NULL cases. So that goes back to
allocating an sbinfo even for the internal mount: I've rounded up to
L1_CACHE_BYTES to avoid false sharing, but even so, please test it out
on your 512-way to make sure I haven't screwed up the scalability we
got before - thanks. If you find it okay, I'll send to akpm soonish.

The second (against the first) being my take on your patch, with
mpol=interleave, and minor alterations which may irritate you so much
you'll revert them immediately! (mainly, using MPOL_INTERLEAVE and
MPOL_DEFAULT within shmem.c rather than defining separate flags).
Only slightly tested at this end.

Hugh
--- 2.6.10-rc1-mm5/Documentation/filesystems/tmpfs.txt 2004-10-18 22:55:53.000000000 +0100
+++ tmpfs1/Documentation/filesystems/tmpfs.txt 2004-11-11 19:24:14.438121120 +0000
@@ -71,8 +71,8 @@ can be changed on remount. The size par
to limit this tmpfs instance to that percentage of your physical RAM:
the default, when neither size nor nr_blocks is specified, is size=50%

-If both nr_blocks (or size) and nr_inodes are set to 0, neither blocks
-nor inodes will be limited in that instance. It is generally unwise to
+If nr_blocks=0 (or size=0), blocks will not be limited in that instance;
+if nr_inodes=0, inodes will not be limited. It is generally unwise to
mount with such options, since it allows any user with write access to
use up all the memory on the machine; but enhances the scalability of
that instance in a system with many cpus making intensive use of it.
@@ -97,4 +97,4 @@ RAM/SWAP in 10240 inodes and it is only
Author:
Christoph Rohland <cr@xxxxxxx>, 1.12.01
Updated:
- Hugh Dickins <hugh@xxxxxxxxxxx>, 01 September 2004
+ Hugh Dickins <hugh@xxxxxxxxxxx>, 10 November 2004
--- 2.6.10-rc1-mm5/mm/shmem.c 2004-11-11 12:40:13.000000000 +0000
+++ tmpfs1/mm/shmem.c 2004-11-11 19:24:14.441120664 +0000
@@ -194,7 +194,7 @@ static spinlock_t shmem_swaplist_lock =
static void shmem_free_blocks(struct inode *inode, long pages)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_blocks += pages;
inode->i_blocks -= pages*BLOCKS_PER_PAGE;
@@ -357,7 +357,7 @@ static swp_entry_t *shmem_swp_alloc(stru
* page (and perhaps indirect index pages) yet to allocate:
* a waste to allocate index if we cannot allocate data.
*/
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks <= 1) {
spin_unlock(&sbinfo->stat_lock);
@@ -678,8 +678,8 @@ static void shmem_delete_inode(struct in
spin_unlock(&shmem_swaplist_lock);
}
}
- if (sbinfo) {
- BUG_ON(inode->i_blocks);
+ BUG_ON(inode->i_blocks);
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1081,7 +1081,7 @@ repeat:
} else {
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks == 0 ||
shmem_acct_block(info->flags)) {
@@ -1269,7 +1269,7 @@ shmem_get_inode(struct super_block *sb,
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
@@ -1319,31 +1319,6 @@ shmem_get_inode(struct super_block *sb,
}

#ifdef CONFIG_TMPFS
-
-static int shmem_set_size(struct shmem_sb_info *sbinfo,
- unsigned long max_blocks, unsigned long max_inodes)
-{
- int error;
- unsigned long blocks, inodes;
-
- spin_lock(&sbinfo->stat_lock);
- blocks = sbinfo->max_blocks - sbinfo->free_blocks;
- inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- error = -EINVAL;
- if (max_blocks < blocks)
- goto out;
- if (max_inodes < inodes)
- goto out;
- error = 0;
- sbinfo->max_blocks = max_blocks;
- sbinfo->free_blocks = max_blocks - blocks;
- sbinfo->max_inodes = max_inodes;
- sbinfo->free_inodes = max_inodes - inodes;
-out:
- spin_unlock(&sbinfo->stat_lock);
- return error;
-}
-
static struct inode_operations shmem_symlink_inode_operations;
static struct inode_operations shmem_symlink_inline_operations;

@@ -1598,15 +1573,17 @@ static int shmem_statfs(struct super_blo
buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
buf->f_namelen = NAME_MAX;
- if (sbinfo) {
- spin_lock(&sbinfo->stat_lock);
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ }
+ if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
- spin_unlock(&sbinfo->stat_lock);
}
/* else leave those fields 0 like simple_statfs */
+ spin_unlock(&sbinfo->stat_lock);
return 0;
}

@@ -1663,7 +1640,7 @@ static int shmem_link(struct dentry *old
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
*/
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
@@ -1688,7 +1665,7 @@ static int shmem_unlink(struct inode *di

if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1912,22 +1889,42 @@ bad_val:
static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- unsigned long max_blocks = 0;
- unsigned long max_inodes = 0;
+ unsigned long max_blocks = sbinfo->max_blocks;
+ unsigned long max_inodes = sbinfo->max_inodes;
+ unsigned long blocks;
+ unsigned long inodes;
+ int error = -EINVAL;

- if (sbinfo) {
- max_blocks = sbinfo->max_blocks;
- max_inodes = sbinfo->max_inodes;
- }
- if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
- return -EINVAL;
- /* Keep it simple: disallow limited <-> unlimited remount */
- if ((max_blocks || max_inodes) == !sbinfo)
- return -EINVAL;
- /* But allow the pointless unlimited -> unlimited remount */
- if (!sbinfo)
- return 0;
- return shmem_set_size(sbinfo, max_blocks, max_inodes);
+ if (shmem_parse_options(data, NULL, NULL, NULL,
+ &max_blocks, &max_inodes))
+ return error;
+
+ spin_lock(&sbinfo->stat_lock);
+ blocks = sbinfo->max_blocks - sbinfo->free_blocks;
+ inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+ if (max_blocks < blocks)
+ goto out;
+ if (max_inodes < inodes)
+ goto out;
+ /*
+ * Those tests also disallow limited->unlimited while any are in
+ * use, so i_blocks will always be zero when max_blocks is zero;
+ * but we must separately disallow unlimited->limited, because
+ * in that case we have no record of how much is already in use.
+ */
+ if (max_blocks && !sbinfo->max_blocks)
+ goto out;
+ if (max_inodes && !sbinfo->max_inodes)
+ goto out;
+
+ error = 0;
+ sbinfo->max_blocks = max_blocks;
+ sbinfo->free_blocks = max_blocks - blocks;
+ sbinfo->max_inodes = max_inodes;
+ sbinfo->free_inodes = max_inodes - inodes;
+out:
+ spin_unlock(&sbinfo->stat_lock);
+ return error;
}
#endif

@@ -1952,11 +1949,11 @@ static int shmem_fill_super(struct super
uid_t uid = current->fsuid;
gid_t gid = current->fsgid;
int err = -ENOMEM;
-
-#ifdef CONFIG_TMPFS
+ struct shmem_sb_info *sbinfo;
unsigned long blocks = 0;
unsigned long inodes = 0;

+#ifdef CONFIG_TMPFS
/*
* Per default we only allow half of the physical ram per
* tmpfs instance, limiting inodes to one per page of lowmem;
@@ -1967,34 +1964,34 @@ static int shmem_fill_super(struct super
inodes = totalram_pages - totalhigh_pages;
if (inodes > blocks)
inodes = blocks;
-
- if (shmem_parse_options(data, &mode,
- &uid, &gid, &blocks, &inodes))
+ if (shmem_parse_options(data, &mode, &uid, &gid,
+ &blocks, &inodes))
return -EINVAL;
}
-
- if (blocks || inodes) {
- struct shmem_sb_info *sbinfo;
- sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
- if (!sbinfo)
- return -ENOMEM;
- sb->s_fs_info = sbinfo;
- spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = blocks;
- sbinfo->free_blocks = blocks;
- sbinfo->max_inodes = inodes;
- sbinfo->free_inodes = inodes;
- }
- sb->s_xattr = shmem_xattr_handlers;
#else
sb->s_flags |= MS_NOUSER;
#endif

+ /* Round up to L1_CACHE_BYTES to resist false sharing */
+ sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
+ L1_CACHE_BYTES), GFP_KERNEL);
+ if (!sbinfo)
+ return -ENOMEM;
+
+ spin_lock_init(&sbinfo->stat_lock);
+ sbinfo->max_blocks = blocks;
+ sbinfo->free_blocks = blocks;
+ sbinfo->max_inodes = inodes;
+ sbinfo->free_inodes = inodes;
+
+ sb->s_fs_info = sbinfo;
sb->s_maxbytes = SHMEM_MAX_BYTES;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
+ sb->s_xattr = shmem_xattr_handlers;
+
inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
if (!inode)
goto failed;
--- tmpfs1/Documentation/filesystems/tmpfs.txt 2004-11-11 19:24:14.438121120 +0000
+++ tmpfs2/Documentation/filesystems/tmpfs.txt 2004-11-11 19:24:32.173424944 +0000
@@ -78,6 +78,12 @@ use up all the memory on the machine; bu
that instance in a system with many cpus making intensive use of it.


+tmpfs has a mount option to set the NUMA memory allocation policy for
+all files in that instance:
+mpol=interleave prefers to allocate memory from each node in turn
+mpol=default prefers to allocate memory from the local node
+
+
To specify the initial root directory you can use the following mount
options:

--- tmpfs1/fs/hugetlbfs/inode.c 2004-11-11 12:39:59.000000000 +0000
+++ tmpfs2/fs/hugetlbfs/inode.c 2004-11-11 19:24:32.174424792 +0000
@@ -400,7 +400,7 @@ static struct inode *hugetlbfs_get_inode
inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
info = HUGETLBFS_I(inode);
- mpol_shared_policy_init(&info->policy);
+ mpol_shared_policy_init(&info->policy, MPOL_DEFAULT);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
--- tmpfs1/include/linux/mempolicy.h 2004-11-11 12:40:09.000000000 +0000
+++ tmpfs2/include/linux/mempolicy.h 2004-11-11 19:24:32.175424640 +0000
@@ -137,12 +137,7 @@ struct shared_policy {
spinlock_t lock;
};

-static inline void mpol_shared_policy_init(struct shared_policy *info)
-{
- info->root = RB_ROOT;
- spin_lock_init(&info->lock);
-}
-
+void mpol_shared_policy_init(struct shared_policy *info, int policy);
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma,
struct mempolicy *new);
@@ -198,7 +193,8 @@ static inline int mpol_set_shared_policy
return -EINVAL;
}

-static inline void mpol_shared_policy_init(struct shared_policy *info)
+static inline void mpol_shared_policy_init(struct shared_policy *info,
+ int policy)
{
}

--- tmpfs1/include/linux/shmem_fs.h 2004-10-18 22:56:50.000000000 +0100
+++ tmpfs2/include/linux/shmem_fs.h 2004-11-11 19:24:32.175424640 +0000
@@ -26,6 +26,7 @@ struct shmem_sb_info {
unsigned long free_blocks; /* How many are left for allocation */
unsigned long max_inodes; /* How many inodes are allowed */
unsigned long free_inodes; /* How many are left for allocation */
+ int policy; /* Default NUMA memory alloc policy */
spinlock_t stat_lock;
};

--- tmpfs1/mm/mempolicy.c 2004-11-11 12:40:12.000000000 +0000
+++ tmpfs2/mm/mempolicy.c 2004-11-11 19:24:32.176424488 +0000
@@ -1060,6 +1060,28 @@ restart:
return 0;
}

+void mpol_shared_policy_init(struct shared_policy *info, int policy)
+{
+ info->root = RB_ROOT;
+ spin_lock_init(&info->lock);
+
+ if (policy != MPOL_DEFAULT) {
+ struct mempolicy *newpol;
+
+ /* Falls back to MPOL_DEFAULT on any error */
+ newpol = mpol_new(policy, nodes_addr(node_online_map));
+ if (!IS_ERR(newpol)) {
+ /* Create pseudo-vma that contains just the policy */
+ struct vm_area_struct pvma;
+
+ memset(&pvma, 0, sizeof(struct vm_area_struct));
+ /* Policy covers entire file */
+ pvma.vm_end = ~0UL;
+ mpol_set_shared_policy(info, &pvma, newpol);
+ }
+ }
+}
+
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma, struct mempolicy *npol)
{
--- tmpfs1/mm/shmem.c 2004-11-11 19:24:14.441120664 +0000
+++ tmpfs2/mm/shmem.c 2004-11-11 19:24:32.178424184 +0000
@@ -1292,7 +1292,7 @@ shmem_get_inode(struct super_block *sb,
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
- mpol_shared_policy_init(&info->policy);
+ mpol_shared_policy_init(&info->policy, sbinfo->policy);
INIT_LIST_HEAD(&info->swaplist);

switch (mode & S_IFMT) {
@@ -1817,7 +1817,7 @@ static struct inode_operations shmem_sym
#endif
};

-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes, int *policy)
{
char *this_char, *value, *rest;

@@ -1871,6 +1871,13 @@ static int shmem_parse_options(char *opt
*gid = simple_strtoul(value,&rest,0);
if (*rest)
goto bad_val;
+ } else if (!strcmp(this_char,"mpol")) {
+ if (!strcmp(value,"interleave"))
+ *policy = MPOL_INTERLEAVE;
+ else if (!strcmp(value,"default"))
+ *policy = MPOL_DEFAULT;
+ else
+ goto bad_val;
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
this_char);
@@ -1891,12 +1898,13 @@ static int shmem_remount_fs(struct super
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
unsigned long max_blocks = sbinfo->max_blocks;
unsigned long max_inodes = sbinfo->max_inodes;
+ int policy = sbinfo->policy;
unsigned long blocks;
unsigned long inodes;
int error = -EINVAL;

if (shmem_parse_options(data, NULL, NULL, NULL,
- &max_blocks, &max_inodes))
+ &max_blocks, &max_inodes, &policy))
return error;

spin_lock(&sbinfo->stat_lock);
@@ -1922,6 +1930,7 @@ static int shmem_remount_fs(struct super
sbinfo->free_blocks = max_blocks - blocks;
sbinfo->max_inodes = max_inodes;
sbinfo->free_inodes = max_inodes - inodes;
+ sbinfo->policy = policy;
out:
spin_unlock(&sbinfo->stat_lock);
return error;
@@ -1952,6 +1961,7 @@ static int shmem_fill_super(struct super
struct shmem_sb_info *sbinfo;
unsigned long blocks = 0;
unsigned long inodes = 0;
+ int policy = MPOL_DEFAULT;

#ifdef CONFIG_TMPFS
/*
@@ -1965,7 +1975,7 @@ static int shmem_fill_super(struct super
if (inodes > blocks)
inodes = blocks;
if (shmem_parse_options(data, &mode, &uid, &gid,
- &blocks, &inodes))
+ &blocks, &inodes, &policy))
return -EINVAL;
}
#else
@@ -1983,6 +1993,7 @@ static int shmem_fill_super(struct super
sbinfo->free_blocks = blocks;
sbinfo->max_inodes = inodes;
sbinfo->free_inodes = inodes;
+ sbinfo->policy = policy;

sb->s_fs_info = sbinfo;
sb->s_maxbytes = SHMEM_MAX_BYTES;