Re: [PATCH] fix race in mark_mounts_for_expiry()

From: Ram
Date: Wed May 18 2005 - 13:50:38 EST


On Wed, 2005-05-18 at 09:53, Miklos Szeredi wrote:
> > E.g. having a separate task count, which is incremented/decremented
> > only by clone/exit. If the task count goes to zero, umount_tree is
> > called on root, but namespace is not freed.
> >
> > And each mnt_namespace holds a proper reference to the namespace, so
> > it's safe to dereference it anytime. When truly no more references
> > remain, the namespace can go away.
> >
> > Hmm?

First of all the reason this race exists implies Al Viro may have had
assertion in his mind that "All tasks that have access to a namespace
has a refcount on that namespace". If that was what he was thinking,
than the I would stick with that assertion being true. The overall idea
I am thinking off is:

1) have the automounter hold a refcount on any new namespace created
the mounts in which the automounter has interest in.
2) have a refcount on the namespace when a new task gains access to
a namespace through the file descriptor or any other
similar mechanisms and remove the reference
once the fd gets closed. (bit tricky to implement)

Do you agree with the overall idea?
If so I will try to come up with a patch.

RP

>
> Here's a patch (compile and boot tested).
>
> Now there are two solutions to one problem, the previous one is ugly,
> this one is buggy. Which should it be?
>
> Thanks,
> Miklos
>
> Index: linux/include/linux/namespace.h
> ===================================================================
> --- linux.orig/include/linux/namespace.h 2005-05-18 18:02:24.000000000 +0200
> +++ linux/include/linux/namespace.h 2005-05-18 18:28:48.000000000 +0200
> @@ -7,18 +7,26 @@
>
> struct namespace {
> atomic_t count;
> + atomic_t task_count; /* how many tasks use this */
> struct vfsmount * root;
> struct list_head list;
> struct rw_semaphore sem;
> };
>
> extern int copy_namespace(int, struct task_struct *);
> -extern void __put_namespace(struct namespace *namespace);
> +extern void __exit_namespace(struct namespace *namespace);
>
> static inline void put_namespace(struct namespace *namespace)
> {
> - if (atomic_dec_and_test(&namespace->count))
> - __put_namespace(namespace);
> + if (namespace && atomic_dec_and_test(&namespace->count))
> + kfree(namespace);
> +}
> +
> +static inline struct namespace *get_namespace(struct namespace *namespace)
> +{
> + if (namespace)
> + atomic_inc(&namespace->count);
> + return namespace;
> }
>
> static inline void exit_namespace(struct task_struct *p)
> @@ -28,14 +36,10 @@ static inline void exit_namespace(struct
> task_lock(p);
> p->namespace = NULL;
> task_unlock(p);
> - put_namespace(namespace);
> + if (atomic_dec_and_test(&p->namespace->task_count))
> + __exit_namespace(p->namespace);
> }
> }
>
> -static inline void get_namespace(struct namespace *namespace)
> -{
> - atomic_inc(&namespace->count);
> -}
> -
> #endif
> #endif
> Index: linux/fs/namespace.c
> ===================================================================
> --- linux.orig/fs/namespace.c 2005-05-18 17:42:46.000000000 +0200
> +++ linux/fs/namespace.c 2005-05-18 18:32:29.000000000 +0200
> @@ -160,7 +160,7 @@ clone_mnt(struct vfsmount *old, struct d
> mnt->mnt_root = dget(root);
> mnt->mnt_mountpoint = mnt->mnt_root;
> mnt->mnt_parent = mnt;
> - mnt->mnt_namespace = current->namespace;
> + mnt->mnt_namespace = get_namespace(current->namespace);
>
> /* stick the duplicate mount on the same expiry list
> * as the original if that was on one */
> @@ -345,13 +345,15 @@ static void umount_tree(struct vfsmount
> for (p = mnt; p; p = next_mnt(p, mnt)) {
> list_del(&p->mnt_list);
> list_add(&p->mnt_list, &kill);
> - p->mnt_namespace = NULL;
> }
>
> while (!list_empty(&kill)) {
> + struct namespace *n;
> mnt = list_entry(kill.next, struct vfsmount, mnt_list);
> list_del_init(&mnt->mnt_list);
> list_del_init(&mnt->mnt_fslink);
> + n = mnt->mnt_namespace;
> + mnt->mnt_namespace = NULL;
> if (mnt->mnt_parent == mnt) {
> spin_unlock(&vfsmount_lock);
> } else {
> @@ -361,6 +363,7 @@ static void umount_tree(struct vfsmount
> path_release(&old_nd);
> }
> mntput(mnt);
> + put_namespace(n);
> spin_lock(&vfsmount_lock);
> }
> }
> @@ -866,12 +869,9 @@ void mark_mounts_for_expiry(struct list_
> mnt = list_entry(graveyard.next, struct vfsmount, mnt_fslink);
> list_del_init(&mnt->mnt_fslink);
>
> - /* don't do anything if the namespace is dead - all the
> - * vfsmounts from it are going away anyway */
> - namespace = mnt->mnt_namespace;
> - if (!namespace || atomic_read(&namespace->count) <= 0)
> + namespace = get_namespace(mnt->mnt_namespace);
> + if (!namespace)
> continue;
> - get_namespace(namespace);
>
> spin_unlock(&vfsmount_lock);
> down_write(&namespace->sem);
> @@ -1073,8 +1073,10 @@ int copy_namespace(int flags, struct tas
>
> get_namespace(namespace);
>
> - if (!(flags & CLONE_NEWNS))
> + if (!(flags & CLONE_NEWNS)) {
> + atomic_inc(&namespace->task_count);
> return 0;
> + }
>
> if (!capable(CAP_SYS_ADMIN)) {
> put_namespace(namespace);
> @@ -1086,6 +1088,7 @@ int copy_namespace(int flags, struct tas
> goto out;
>
> atomic_set(&new_ns->count, 1);
> + atomic_set(&new_ns->task_count, 1);
> init_rwsem(&new_ns->sem);
> INIT_LIST_HEAD(&new_ns->list);
>
> @@ -1109,7 +1112,9 @@ int copy_namespace(int flags, struct tas
> p = namespace->root;
> q = new_ns->root;
> while (p) {
> - q->mnt_namespace = new_ns;
> + struct namespace *oldns = q->mnt_namespace;
> + q->mnt_namespace = get_namespace(new_ns);
> + put_namespace(oldns);
> if (fs) {
> if (p == fs->rootmnt) {
> rootmnt = p;
> @@ -1381,16 +1386,17 @@ static void __init init_mount_tree(void)
> if (!namespace)
> panic("Can't allocate initial namespace");
> atomic_set(&namespace->count, 1);
> + atomic_set(&namespace->task_count, 0);
> INIT_LIST_HEAD(&namespace->list);
> init_rwsem(&namespace->sem);
> list_add(&mnt->mnt_list, &namespace->list);
> namespace->root = mnt;
> - mnt->mnt_namespace = namespace;
> + mnt->mnt_namespace = get_namespace(namespace);
>
> init_task.namespace = namespace;
> read_lock(&tasklist_lock);
> do_each_thread(g, p) {
> - get_namespace(namespace);
> + atomic_inc(&namespace->task_count);
> p->namespace = namespace;
> } while_each_thread(g, p);
> read_unlock(&tasklist_lock);
> @@ -1448,12 +1454,13 @@ void __init mnt_init(unsigned long mempa
> init_mount_tree();
> }
>
> -void __put_namespace(struct namespace *namespace)
> +void __exit_namespace(struct namespace *namespace)
> {
> down_write(&namespace->sem);
> spin_lock(&vfsmount_lock);
> umount_tree(namespace->root);
> + namespace->root = NULL;
> spin_unlock(&vfsmount_lock);
> up_write(&namespace->sem);
> - kfree(namespace);
> + put_namespace(namespace);
> }
> Index: linux/fs/super.c
> ===================================================================
> --- linux.orig/fs/super.c 2005-05-17 13:46:56.000000000 +0200
> +++ linux/fs/super.c 2005-05-18 18:44:18.000000000 +0200
> @@ -37,6 +37,7 @@
> #include <linux/writeback.h> /* for the emergency remount stuff */
> #include <linux/idr.h>
> #include <linux/kobject.h>
> +#include <linux/namespace.h>
> #include <asm/uaccess.h>
>
>
> @@ -842,7 +843,7 @@ do_kern_mount(const char *fstype, int fl
> mnt->mnt_root = dget(sb->s_root);
> mnt->mnt_mountpoint = sb->s_root;
> mnt->mnt_parent = mnt;
> - mnt->mnt_namespace = current->namespace;
> + mnt->mnt_namespace = get_namespace(current->namespace);
> up_write(&sb->s_umount);
> put_filesystem(type);
> return mnt;

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/