[GIT PULL] Detaching mounts on unlink for 3.15-rc1

From: Eric W. Biederman
Date: Tue Apr 08 2014 - 20:22:09 EST

Next message: Luiz Capitulino: "Re: [PATCH 4/4] hugetlb: add support for gigantic page allocation at runtime"
Previous message: Jan Hubicka: "Re: [GIT] kbuild/lto changes for 3.15-rc1"
Next in thread: Al Viro: "Re: [GIT PULL] Detaching mounts on unlink for 3.15-rc1"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Linus,

Please pull the for-linus branch from the git tree:

git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-linus

HEAD: 0d7d90f86f83f29a442b37c78172870f8ee28c58 proc: Update proc_flush_task_mnt to use d_invalidate

My apologies for sending this pull request late. I thought these
changes were going to come through Al's tree, but that doesn't look like
it is going to happen, and it is long past time for these changes to be
merged.

This set of changes has been reviewed and been sitting idle for the last
6 weeks. In that time the vfs has slightly shifted under me: there is a
new version of rename, and the mount hash list has become an hlist. None
of those changes has altered the code in ways that invalidate these
changes, but small conflicts do result and I have attached my conflict
resolution at the end of this email in case it helps.

To recap these changes allow a file or a directory that is a mount point
in one mount namespace to be unlinked/rmdired elsewhere where it is not
a mount point (either a remote filesystem or another mount namespace).
As has been agreed during review, the semantics when only a single mount
namespace exists remain unchanged.

This removes a long standing need to lie to the vfs when a mount point
has been removed behind its back. This also removes a DoS attack where
an unprivileged user could prevent root from renaming or deleting files
and directories by using them as mountpoints in another mount namespace.

This change also fixes a few cases where, because we were not lying to
the vfs, we could leak mount points.

When renaming or unlinking directory entries that are not mountpoints
no additional locks are taken so no performance differences can result,
and my benchmark reflected that.

To verify that nothing significant has changed from the time this code
was written until now, I have performed a test merge. I successfully
performed an allyesconfig build (skipping a broken wireless driver in
staging) and tested to make certain that the code still functions
as expected.

Eric W. Biederman (12):
vfs: Document the effect of d_revalidate on d_find_alias
vfs: More precise tests in d_invalidate
vfs: Don't allow overwriting mounts in the current mount namespace
vfs: Keep a list of mounts on a mount point
vfs: factor out lookup_mountpoint from new_mountpoint
vfs: Add a function to lazily unmount all mounts from any dentry.
vfs: Lazily remove mounts on unlinked files and directories.
vfs: Remove unnecessary calls of check_submounts_and_drop
vfs: Merge check_submounts_and_drop and d_invalidate
vfs: Make d_invalidate return void
vfs: Remove d_drop calls from d_revalidate implementations
proc: Update proc_flush_task_mnt to use d_invalidate

fs/afs/dir.c | 5 --
fs/btrfs/ioctl.c | 5 +--
fs/ceph/dir.c | 1 -
fs/cifs/readdir.c | 6 +--
fs/dcache.c | 140 +++++++++++++++++-------------------------------
fs/fuse/dir.c | 7 +--
fs/gfs2/dentry.c | 3 -
fs/kernfs/dir.c | 11 ----
fs/mount.h | 20 +++++++
fs/namei.c | 28 ++++++----
fs/namespace.c | 87 +++++++++++++++++++++++++++++-
fs/nfs/dir.c | 7 +--
fs/proc/base.c | 10 +---
fs/proc/fd.c | 2 -
include/linux/dcache.h | 3 +-
15 files changed, 178 insertions(+), 157 deletions(-)

Eric

---

My merge conflict resolution.

__d_move gained an argument, and I took it out of a conditional.

m_hash went from a struct list_head to a struct hlist_node, changing
nearby code and affecting my factoring out of lookup_mountpoint from
new_mountpoint.

There was a major refactoring of rename. As long as d_mountpoint
becomes is_local_mountpoint and detach_mounts is added after dont_mount,
all is well.

dcache.c | 6 ++----
mount.h | 1 +
namei.c | 3 ++-
namespace.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 51 insertions(+), 7 deletions(-)

diff --cc fs/dcache.c
index 66cba5a8a346,5b78bd98649c..d4a1e55d65a9
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@@ -2701,10 -2631,8 +2663,8 @@@ static struct dentry *__d_unalias(struc
goto out_err;
m2 = &alias->d_parent->d_inode->i_mutex;
out_unalias:
- if (likely(!d_mountpoint(alias))) {
- __d_move(alias, dentry, false);
- ret = alias;
- }
- __d_move(alias, dentry);
++ __d_move(alias, dentry, false);
+ ret = alias;
out_err:
spin_unlock(&inode->i_lock);
if (m2)
diff --cc fs/mount.h
index b29e42f05f34,c5e717542bbc..aa3c0aa473df
--- a/fs/mount.h
+++ b/fs/mount.h
@@@ -19,8 -19,9 +19,9 @@@ struct mnt_pcp
};

struct mountpoint {
- struct list_head m_hash;
+ struct hlist_node m_hash;
struct dentry *m_dentry;
+ struct list_head m_list;
int m_count;
};

diff --cc fs/namei.c
index 88339f59efb5,384fcc6a5606..0e438b395e28
--- a/fs/namei.c
+++ b/fs/namei.c
@@@ -4082,33 -4045,17 +4085,33 @@@ int vfs_rename(struct inode *old_dir, s
if (error)
return error;

+ old_name = fsnotify_oldname_init(old_dentry->d_name.name);
dget(new_dentry);
- lock_two_nondirectories(source, target);
+ if (!is_dir || (flags & RENAME_EXCHANGE))
+ lock_two_nondirectories(source, target);
+ else if (target)
+ mutex_lock(&target->i_mutex);

error = -EBUSY;
- if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+ if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
goto out;

- error = try_break_deleg(source, delegated_inode);
- if (error)
- goto out;
- if (target) {
+ if (max_links && new_dir != old_dir) {
+ error = -EMLINK;
+ if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
+ goto out;
+ if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
+ old_dir->i_nlink >= max_links)
+ goto out;
+ }
+ if (is_dir && !(flags & RENAME_EXCHANGE) && target)
+ shrink_dcache_parent(new_dentry);
+ if (!is_dir) {
+ error = try_break_deleg(source, delegated_inode);
+ if (error)
+ goto out;
+ }
+ if (target && !new_is_dir) {
error = try_break_deleg(target, delegated_inode);
if (error)
goto out;
@@@ -4123,31 -4064,73 +4126,32 @@@
if (error)
goto out;

- if (target) {
+ if (!(flags & RENAME_EXCHANGE) && target) {
+ if (is_dir)
+ target->i_flags |= S_DEAD;
dont_mount(new_dentry);
+ detach_mounts(new_dentry);
}
- if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
- d_move(old_dentry, new_dentry);
+ if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
+ if (!(flags & RENAME_EXCHANGE))
+ d_move(old_dentry, new_dentry);
+ else
+ d_exchange(old_dentry, new_dentry);
+ }
out:
- unlock_two_nondirectories(source, target);
+ if (!is_dir || (flags & RENAME_EXCHANGE))
+ unlock_two_nondirectories(source, target);
+ else if (target)
+ mutex_unlock(&target->i_mutex);
dput(new_dentry);
- return error;
-}
-
-/**
- * vfs_rename - rename a filesystem object
- * @old_dir: parent of source
- * @old_dentry: source
- * @new_dir: parent of destination
- * @new_dentry: destination
- * @delegated_inode: returns an inode needing a delegation break
- *
- * The caller must hold multiple mutexes--see lock_rename()).
- *
- * If vfs_rename discovers a delegation in need of breaking at either
- * the source or destination, it will return -EWOULDBLOCK and return a
- * reference to the inode in delegated_inode. The caller should then
- * break the delegation and retry. Because breaking a delegation may
- * take a long time, the caller should drop all locks before doing
- * so.
- *
- * Alternatively, a caller may pass NULL for delegated_inode. This may
- * be appropriate for callers that expect the underlying filesystem not
- * to be NFS exported.
- */
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- struct inode **delegated_inode)
-{
- int error;
- int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry);
- const unsigned char *old_name;
-
- if (old_dentry->d_inode == new_dentry->d_inode)
- return 0;
-
- error = may_delete(old_dir, old_dentry, is_dir);
- if (error)
- return error;
-
- if (!new_dentry->d_inode)
- error = may_create(new_dir, new_dentry);
- else
- error = may_delete(new_dir, new_dentry, is_dir);
- if (error)
- return error;
-
- if (!old_dir->i_op->rename)
- return -EPERM;
-
- old_name = fsnotify_oldname_init(old_dentry->d_name.name);
-
- if (is_dir)
- error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
- else
- error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode);
- if (!error)
+ if (!error) {
fsnotify_move(old_dir, new_dir, old_name, is_dir,
- new_dentry->d_inode, old_dentry);
+ !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
+ if (flags & RENAME_EXCHANGE) {
+ fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
+ new_is_dir, NULL, new_dentry);
+ }
+ }
fsnotify_oldname_free(old_name);

return error;
diff --cc fs/namespace.c
index 2ffc5a2905d4,52f4174e294c..c809205f30df
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@@ -665,13 -632,47 +666,47 @@@ struct vfsmount *lookup_mnt(struct pat
return m;
}

- static struct mountpoint *new_mountpoint(struct dentry *dentry)
+ /*
+ * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
+ * current mount namespace.
+ *
+ * The common case is dentries are not mountpoints at all and that
+ * test is handled inline. For the slow case when we are actually
+ * dealing with a mountpoint of some kind, walk through all of the
+ * mounts in the current mount namespace and test to see if the dentry
+ * is a mountpoint.
+ *
+ * The mount_hashtable is not usable in the context because we
+ * need to identify all mounts that may be in the current mount
+ * namespace not just a mount that happens to have some specified
+ * parent mount.
+ */
+ bool __is_local_mountpoint(struct dentry *dentry)
+ {
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ struct mount *mnt;
+ bool is_covered = false;
+
+ if (!d_mountpoint(dentry))
+ goto out;
+
+ down_read(&namespace_sem);
+ list_for_each_entry(mnt, &ns->list, mnt_list) {
+ is_covered = (mnt->mnt_mountpoint == dentry);
+ if (is_covered)
+ break;
+ }
+ up_read(&namespace_sem);
+ out:
+ return is_covered;
+ }
+
+ static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
- struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
+ struct hlist_head *chain = mp_hash(dentry);
struct mountpoint *mp;
- int ret;

- list_for_each_entry(mp, chain, m_hash) {
+ hlist_for_each_entry(mp, chain, m_hash) {
if (mp->m_dentry == dentry) {
/* might be worth a WARN_ON() */
if (d_unlinked(dentry))
@@@ -680,6 -681,14 +715,14 @@@
return mp;
}
}
+ return NULL;
+ }
+
+ static struct mountpoint *new_mountpoint(struct dentry *dentry)
+ {
- struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
++ struct hlist_head *chain = mp_hash(dentry);
+ struct mountpoint *mp;
+ int ret;

mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
if (!mp)
@@@ -693,7 -702,8 +736,8 @@@

mp->m_dentry = dentry;
mp->m_count = 1;
- list_add(&mp->m_hash, chain);
+ hlist_add_head(&mp->m_hash, chain);
+ INIT_LIST_HEAD(&mp->m_list);
return mp;
}

@@@ -746,7 -757,8 +791,8 @@@ static void detach_mnt(struct mount *mn
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
- list_del_init(&mnt->mnt_hash);
+ hlist_del_init_rcu(&mnt->mnt_hash);
+ list_del_init(&mnt->mnt_mp_list);
put_mountpoint(mnt->mnt_mp);
mnt->mnt_mp = NULL;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Luiz Capitulino: "Re: [PATCH 4/4] hugetlb: add support for gigantic page allocation at runtime"
Previous message: Jan Hubicka: "Re: [GIT] kbuild/lto changes for 3.15-rc1"
Next in thread: Al Viro: "Re: [GIT PULL] Detaching mounts on unlink for 3.15-rc1"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]