Re: [patch 2/4] mempolicy: support optional mode flags

From: Lee Schermerhorn
Date: Tue Feb 12 2008 - 19:14:37 EST


On Mon, 2008-02-11 at 07:30 -0800, David Rientjes wrote:
> With the evolution of mempolicies, it is necessary to support mempolicy
> mode flags that specify how the policy shall behave in certain
> circumstances. The most immediate need for mode flag support is to
> suppress remapping the nodemask of a policy at the time of rebind.
>
> With the small number of possible mempolicy modes (currently four) and
> the large size of the struct mempolicy member that stores this mode
> (unsigned short), it is possible to store both the mode and optional mode
> flags in the same member.
>
> To do this, it is necessary to mask off the optional mode flag bits when
> testing the mempolicy mode. A new constant, MPOL_FLAG_SHIFT, indicates
> the shift necessary to find the flag bits within struct mempolicy's
> policy member. With this, MPOL_MODE_MASK can be defined to mask off the
> optional flags, yielding just the mode itself.
>
> A helper function:
>
> unsigned char mpol_mode(unsigned short mode)
>
> has been implemented to return only a policy's mode. Notice that the
> return type is unsigned char since MPOL_FLAG_SHIFT is currently defined
> at eight. mpol_flags(unsigned short mode) does the inverse.
>
> While this requires frequent use of mpol_mode() within the internal
> mempolicy code, it is easy to distinguish between actuals that contain
> only modes and those that contain the entire policy member (modes and
> flags). If the formal uses enum mempolicy_mode, only the mode itself
> may successfully be passed because of type checking.

What is the benefit of pulling the flags and mode apart at the user
interface, passing them as separate args to mpol_new(), do_*() and
mpol_shared_policy_init(), and then stitching them back together in
mpol_new()?  Modes passed in via [sys_]{set_mempolicy|mbind}() and
those stored in the shmem_sb_info can already carry the flags with
them, so this seems like extra work.

I think this patch and the previous one [1/4] would be simplified if the
formal parameters were left as an int [maybe unsigned] and the flags
were masked off where necessary in mpol_new() and elsewhere.
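Something like this [untested] is roughly what I have in mind; the
combined-value signature and variable names are just for illustration:

	static struct mempolicy *mpol_new(unsigned short policy,
					  nodemask_t *nodes)
	{
		unsigned char mode = mpol_mode(policy);
		unsigned short flags = mpol_flags(policy) & MPOL_MODE_FLAGS;
		struct mempolicy *new;

		if (mode == MPOL_DEFAULT)
			return NULL;
		new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
		if (!new)
			return ERR_PTR(-ENOMEM);
		atomic_set(&new->refcnt, 1);

		/* ... per-mode setup exactly as in the patch ... */

		new->policy = mode | flags;	/* mode and flags kept together */
		new->cpuset_mems_allowed = cpuset_mems_allowed(current);
		return new;
	}

do_set_mempolicy(), do_mbind() and mpol_shared_policy_init() could then
just take the combined value and pass it through unchanged.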

[more comments below]

>
> Note that this does not break userspace code that relies on
> get_mempolicy(&policy, ...) and either 'switch (policy)' statements or
> 'if (policy == MPOL_INTERLEAVE)' statements. Such applications would
> need to use optional mode flags when calling set_mempolicy() or mbind()
> for these previously implemented statements to stop working. If an
> application does start using optional mode flags, it will need to use
> mpol_mode() in switch and conditional statements that only test mode.

> Cc: Paul Jackson <pj@xxxxxxx>
> Cc: Christoph Lameter <clameter@xxxxxxx>
> Cc: Lee Schermerhorn <Lee.Schermerhorn@xxxxxx>
> Cc: Andi Kleen <ak@xxxxxxx>
> Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx>
> ---
> fs/hugetlbfs/inode.c | 2 +-
> include/linux/mempolicy.h | 39 +++++++++++++++++++++--
> mm/mempolicy.c | 77 ++++++++++++++++++++++++++------------------
> mm/shmem.c | 15 ++++++--
> 4 files changed, 93 insertions(+), 40 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -504,7 +504,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
> inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> INIT_LIST_HEAD(&inode->i_mapping->private_list);
> info = HUGETLBFS_I(inode);
> - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
> + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, NULL);
> switch (mode & S_IFMT) {
> default:
> init_special_inode(inode, mode, dev);
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -17,7 +17,18 @@ enum mempolicy_mode {
> MPOL_MAX, /* always last member of mempolicy_mode */
> };
>
> -/* Flags for get_mem_policy */
> +/*
> + * The lower MPOL_FLAG_SHIFT bits of the policy mode represent the MPOL_*
> + * constants defined in enum mempolicy_mode. The upper bits represent
> + * optional set_mempolicy() MPOL_F_* mode flags.
> + */
> +#define MPOL_FLAG_SHIFT (8)
> +#define MPOL_MODE_MASK ((1 << MPOL_FLAG_SHIFT) - 1)
> +
> +/* Flags for set_mempolicy */
> +#define MPOL_MODE_FLAGS (0) /* legal set_mempolicy() MPOL_* mode flags */
> +
> +/* Flags for get_mempolicy */
> #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
> #define MPOL_F_ADDR (1<<1) /* look up vma using address */
> #define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
> @@ -115,6 +126,16 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
>
> #define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)
>
> +static inline unsigned char mpol_mode(unsigned short mode)
> +{
> + return mode & MPOL_MODE_MASK;
> +}
> +
> +static inline unsigned short mpol_flags(unsigned short mode)
> +{
> + return mode & ~MPOL_MODE_MASK;
> +}
> +
> /*
> * Tree of shared policies for a shared memory region.
> * Maintain the policies in a pseudo mm that contains vmas. The vmas
> @@ -135,7 +156,8 @@ struct shared_policy {
> };
>
> void mpol_shared_policy_init(struct shared_policy *info,
> - enum mempolicy_mode policy, nodemask_t *nodes);
> + enum mempolicy_mode policy, unsigned short flags,
> + nodemask_t *nodes);
> int mpol_set_shared_policy(struct shared_policy *info,
> struct vm_area_struct *vma,
> struct mempolicy *new);
> @@ -176,6 +198,16 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
> }
> #define vma_mpol_equal(a,b) 1
>
> +static inline unsigned char mpol_mode(unsigned short mode)
> +{
> + return 0;
> +}
> +
> +static inline unsigned short mpol_flags(unsigned short mode)
> +{
> + return 0;
> +}
> +
> #define mpol_set_vma_default(vma) do {} while(0)
>
> static inline void mpol_free(struct mempolicy *p)
> @@ -201,7 +233,8 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
> }
>
> static inline void mpol_shared_policy_init(struct shared_policy *info,
> - enum mempolicy_type policy, nodemask_t *nodes)
> + enum mempolicy_type policy, unsigned short flags,
> + nodemask_t *nodes)
> {
> }
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -203,18 +203,20 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
> }
>
> /* Create a new policy */
> -static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
> +static struct mempolicy *mpol_new(enum mempolicy_mode mode,
> + unsigned short flags, nodemask_t *nodes)
> {
> struct mempolicy *policy;
>
> - pr_debug("setting mode %d nodes[0] %lx\n",
> - mode, nodes ? nodes_addr(*nodes)[0] : -1);
> + pr_debug("setting mode %d flags %d nodes[0] %lx\n",
> + mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
>
> if (mode == MPOL_DEFAULT)
> return NULL;
> policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
> if (!policy)
> return ERR_PTR(-ENOMEM);
> + flags &= MPOL_MODE_FLAGS;
> atomic_set(&policy->refcnt, 1);
> switch (mode) {
> case MPOL_INTERLEAVE:
> @@ -240,7 +242,7 @@ static struct mempolicy *mpol_new(enum mempolicy_mode mode, nodemask_t *nodes)
> default:
> BUG();
> }
> - policy->policy = mode;
> + policy->policy = mode | flags;
> policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
> return policy;
> }
> @@ -483,19 +485,20 @@ static void mpol_set_task_struct_flag(void)
> }
>
> /* Set the process memory policy */
> -static long do_set_mempolicy(enum mempolicy_mode mode, nodemask_t *nodes)
> +static long do_set_mempolicy(enum mempolicy_mode mode, unsigned short flags,
> + nodemask_t *nodes)
> {
> struct mempolicy *new;
>
> if (mpol_check_policy(mode, nodes))
> return -EINVAL;
> - new = mpol_new(mode, nodes);
> + new = mpol_new(mode, flags, nodes);
> if (IS_ERR(new))
> return PTR_ERR(new);
> mpol_free(current->mempolicy);
> current->mempolicy = new;
> mpol_set_task_struct_flag();
> - if (new && new->policy == MPOL_INTERLEAVE)
> + if (new && mpol_mode(new->policy) == MPOL_INTERLEAVE)
> current->il_next = first_node(new->v.nodes);
> return 0;
> }
> @@ -506,7 +509,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
> int i;
>
> nodes_clear(*nodes);
> - switch (p->policy) {
> + switch (mpol_mode(p->policy)) {
> case MPOL_BIND:
> for (i = 0; p->v.zonelist->zones[i]; i++)
> node_set(zone_to_nid(p->v.zonelist->zones[i]),
> @@ -588,7 +591,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> goto out;
> *policy = err;
> } else if (pol == current->mempolicy &&
> - pol->policy == MPOL_INTERLEAVE) {
> + mpol_mode(pol->policy) == MPOL_INTERLEAVE) {
> *policy = current->il_next;
> } else {
> err = -EINVAL;
> @@ -785,8 +788,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
> #endif
>
> static long do_mbind(unsigned long start, unsigned long len,
> - enum mempolicy_mode mode, nodemask_t *nmask,
> - unsigned long flags)
> + enum mempolicy_mode mode, unsigned short mode_flags,
> + nodemask_t *nmask, unsigned long flags)
> {
> struct vm_area_struct *vma;
> struct mm_struct *mm = current->mm;
> @@ -818,7 +821,7 @@ static long do_mbind(unsigned long start, unsigned long len,
> if (mpol_check_policy(mode, nmask))
> return -EINVAL;
>
> - new = mpol_new(mode, nmask);
> + new = mpol_new(mode, mode_flags, nmask);
> if (IS_ERR(new))
> return PTR_ERR(new);
>
> @@ -829,8 +832,9 @@ static long do_mbind(unsigned long start, unsigned long len,
> if (!new)
> flags |= MPOL_MF_DISCONTIG_OK;
>
> - pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
> - mode, nmask ? nodes_addr(*nmask)[0] : -1);
> + pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
> + start, start + len, mode, mode_flags,
> + nmask ? nodes_addr(*nmask)[0] : -1);
>
> down_write(&mm->mmap_sem);
> vma = check_range(mm, start, end, nmask,
> @@ -929,13 +933,16 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
> {
> nodemask_t nodes;
> int err;
> + unsigned short mode_flags;
>
> + mode_flags = mpol_flags(mode);
>
>
I suggest restricting the flags to the defined flags here.  This gives
us the ability to define new flags later without breaking [changing the
behavior of] applications that accidentally pass undefined flags.  An
error return would force the user to fix the application [assuming they
check error returns].
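
E.g., something along these lines [untested], so any flag bit we
haven't defined yet is rejected up front:

	mode_flags = mpol_flags(mode);
	if (mode_flags & ~MPOL_MODE_FLAGS)
		return -EINVAL;
	mode = mpol_mode(mode);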

> + mode = mpol_mode(mode);
> if (mode >= MPOL_MAX)
> return -EINVAL;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> return err;
> - return do_mbind(start, len, mode, &nodes, flags);
> + return do_mbind(start, len, mode, mode_flags, &nodes, flags);
> }
>
> /* Set the process memory policy */
> @@ -944,13 +951,16 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
> {
> int err;
> nodemask_t nodes;
> + unsigned short flags;
>
> + flags = mpol_flags(mode);

Suggest similar arg checking here.

> + mode = mpol_mode(mode);
> if (mode < 0 || mode >= MPOL_MAX)

Now that we're extracting an unsigned char from the mode via
mpol_mode(), I think we can drop the 'mode < 0 ||' test.  The same
would hold if we didn't pull the flags and mode apart and just used:

	if (mpol_mode(mode) >= MPOL_MAX)
		return -EINVAL;

> return -EINVAL;
> err = get_nodes(&nodes, nmask, maxnode);
> if (err)
> return err;
> - return do_set_mempolicy(mode, &nodes);
> + return do_set_mempolicy(mode, flags, &nodes);
> }
>
> asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
> @@ -1152,7 +1162,7 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
> pol = vma->vm_ops->get_policy(vma, addr);
> shared_pol = 1; /* if pol non-NULL, add ref below */
> } else if (vma->vm_policy &&
> - vma->vm_policy->policy != MPOL_DEFAULT)
> + mpol_mode(vma->vm_policy->policy) != MPOL_DEFAULT)
> pol = vma->vm_policy;
> }
> if (!pol)
> @@ -1167,7 +1177,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
> {
> int nd;
>
> - switch (policy->policy) {
> + switch (mpol_mode(policy->policy)) {
> case MPOL_PREFERRED:
> nd = policy->v.preferred_node;
> if (nd < 0)
> @@ -1211,7 +1221,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
> */
> unsigned slab_node(struct mempolicy *policy)
> {
> - enum mempolicy_mode pol = policy ? policy->policy : MPOL_DEFAULT;
> + enum mempolicy_mode pol = policy ? mpol_mode(policy->policy) :
> + MPOL_DEFAULT;
>
> switch (pol) {
> case MPOL_INTERLEAVE:
> @@ -1297,7 +1308,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
> struct zonelist *zl;
>
> *mpol = NULL; /* probably no unref needed */
> - if (pol->policy == MPOL_INTERLEAVE) {
> + if (mpol_mode(pol->policy) == MPOL_INTERLEAVE) {
> unsigned nid;
>
> nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
> @@ -1307,7 +1318,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
>
> zl = zonelist_policy(GFP_HIGHUSER, pol);
> if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
> - if (pol->policy != MPOL_BIND)
> + if (mpol_mode(pol->policy) != MPOL_BIND)
> __mpol_free(pol); /* finished with pol */
> else
> *mpol = pol; /* unref needed after allocation */
> @@ -1361,7 +1372,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
>
> cpuset_update_task_memory_state();
>
> - if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
> + if (unlikely(mpol_mode(pol->policy) == MPOL_INTERLEAVE)) {
> unsigned nid;
>
> nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
> @@ -1409,7 +1420,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
> cpuset_update_task_memory_state();
> if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
> pol = &default_policy;
> - if (pol->policy == MPOL_INTERLEAVE)
> + if (mpol_mode(pol->policy) == MPOL_INTERLEAVE)
> return alloc_page_interleave(gfp, order, interleave_nodes(pol));
> return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
> }
> @@ -1436,7 +1447,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
> }
> *new = *old;
> atomic_set(&new->refcnt, 1);
> - if (new->policy == MPOL_BIND) {
> + if (mpol_mode(new->policy) == MPOL_BIND) {
> int sz = ksize(old->v.zonelist);
> new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
> if (!new->v.zonelist) {
> @@ -1454,7 +1465,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
> return 0;
> if (a->policy != b->policy)
> return 0;
> - switch (a->policy) {
> + switch (mpol_mode(a->policy)) {
> case MPOL_DEFAULT:
> return 1;
> case MPOL_INTERLEAVE:
> @@ -1479,7 +1490,7 @@ void __mpol_free(struct mempolicy *p)
> {
> if (!atomic_dec_and_test(&p->refcnt))
> return;
> - if (p->policy == MPOL_BIND)
> + if (mpol_mode(p->policy) == MPOL_BIND)
> kfree(p->v.zonelist);
> p->policy = MPOL_DEFAULT;
> kmem_cache_free(policy_cache, p);
> @@ -1640,7 +1651,8 @@ restart:
> }
>
> void mpol_shared_policy_init(struct shared_policy *info,
> - enum mempolicy_mode policy, nodemask_t *policy_nodes)
> + enum mempolicy_mode policy, unsigned short flags,
> + nodemask_t *policy_nodes)
> {
> info->root = RB_ROOT;
> spin_lock_init(&info->lock);
> @@ -1649,7 +1661,7 @@ void mpol_shared_policy_init(struct shared_policy *info,
> struct mempolicy *newpol;
>
> /* Falls back to MPOL_DEFAULT on any error */
> - newpol = mpol_new(policy, policy_nodes);
> + newpol = mpol_new(policy, flags, policy_nodes);
> if (!IS_ERR(newpol)) {
> /* Create pseudo-vma that contains just the policy */
> struct vm_area_struct pvma;
> @@ -1745,14 +1757,14 @@ void __init numa_policy_init(void)
> if (unlikely(nodes_empty(interleave_nodes)))
> node_set(prefer, interleave_nodes);
>
> - if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
> + if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
> printk("numa_policy_init: interleaving failed\n");
> }
>
> /* Reset policy of current process to default */
> void numa_default_policy(void)
> {
> - do_set_mempolicy(MPOL_DEFAULT, NULL);
> + do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
> }
>
> /* Migrate a policy to a different set of nodes */
> @@ -1768,7 +1780,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> if (nodes_equal(*mpolmask, *newmask))
> return;
>
> - switch (pol->policy) {
> + switch (mpol_mode(pol->policy)) {
> case MPOL_DEFAULT:
> break;
> case MPOL_INTERLEAVE:
> @@ -1858,7 +1870,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
> char *p = buffer;
> int l;
> nodemask_t nodes;
> - enum mempolicy_mode mode = pol ? pol->policy : MPOL_DEFAULT;
> + enum mempolicy_mode mode = pol ? mpol_mode(pol->policy) :
> + MPOL_DEFAULT;
>
> switch (mode) {
> case MPOL_DEFAULT:
> diff --git a/mm/shmem.c b/mm/shmem.c
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1086,6 +1086,7 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> nodemask_t *policy_nodes)
> {
> char *nodelist = strchr(value, ':');
> + char *flags = strchr(value, '=');
> int err = 1;
>
> if (nodelist) {
> @@ -1096,6 +1097,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
> goto out;
> }
> + if (flags)
> + *flags++ = '\0';
> if (!strcmp(value, "default")) {
> *policy = MPOL_DEFAULT;
> /* Don't allow a nodelist */
> @@ -1125,6 +1128,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
> *policy_nodes = node_states[N_HIGH_MEMORY];
> err = 0;
> }
> + if (flags) {
> + }
> out:
> /* Restore string for error message */
> if (nodelist)
> @@ -1154,7 +1159,7 @@ static void shmem_show_mpol(struct seq_file *seq, enum mempolicy_mode policy,
>
> seq_printf(seq, ",mpol=%s", policy_string);
>
> - if (policy != MPOL_INTERLEAVE ||
> + if (mpol_mode(policy) != MPOL_INTERLEAVE ||
> !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) {
> char buffer[64];
> int len;
> @@ -1577,8 +1582,10 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
> case S_IFREG:
> inode->i_op = &shmem_inode_operations;
> inode->i_fop = &shmem_file_operations;
> - mpol_shared_policy_init(&info->policy, sbinfo->policy,
> - &sbinfo->policy_nodes);
> + mpol_shared_policy_init(&info->policy,
> + mpol_mode(sbinfo->policy),
> + mpol_flags(sbinfo->policy),
> + &sbinfo->policy_nodes);
> break;
> case S_IFDIR:
> inc_nlink(inode);
> @@ -1592,7 +1599,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
> * Must not load anything in the rbtree,
> * mpol_free_shared_policy will not be called.
> */
> - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
> + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0,
> NULL);
> break;
> }
