Re: [PATCH] thread wakeup fix for 2.4.0-test7

From: Tigran Aivazian (tigran@veritas.com)
Date: Tue Aug 29 2000 - 06:48:27 EST


On 27 Aug 2000, Linus Torvalds wrote:
> I'm basically saying: if you want reliable behaviour, you should make
> sure in user land that you don't close a file descriptor that is in use
          ~~~~~~~~~

> ...
> And I'm right. I'm always right, but in this case I'm just a bit more
> right than I usually am.
>
> Just don't close a file descriptor while it is in use. It's that simple.
>
> (I'm also claiming that Linux will give the most reasonable possible
> semantics for the case where you _do_ close the file desciptor when it
> is in use. I still claim that you shouldn't do it.)
>

Dear Linus,

I have read your email but would you care to say if you also object to
the idea of doing that in kernel land, please?

As you say, the kernel side is consistent and not-racey. So, is there
anything wrong with the idea of closing a file descriptor currently
(possibly) in use in the kernel, as required for the forced umount? This
is the safest (to the best of my knowledge) way of doing the forced umount
in the present Linux/VFS framework.

To make things more specific, I am talking about the patch below.

Regards,
Tigran

diff -urN -X dontdiff linux/fs/Makefile badfs/fs/Makefile
--- linux/fs/Makefile Thu Aug 10 06:51:11 2000
+++ badfs/fs/Makefile Tue Aug 15 10:27:09 2000
@@ -18,9 +18,9 @@
 ALL_SUB_DIRS := coda minix ext2 fat msdos vfat proc isofs nfs umsdos ntfs \
                 hpfs sysv smbfs ncpfs ufs efs affs romfs autofs hfs lockd \
                 nfsd nls devpts devfs adfs partitions qnx4 udf bfs cramfs \
- openpromfs autofs4 ramfs jffs
+ openpromfs autofs4 ramfs jffs badfs
 
-SUB_DIRS :=
+SUB_DIRS := badfs
 
 ifeq ($(CONFIG_QUOTA),y)
 O_OBJS += dquot.o
diff -urN -X dontdiff linux/fs/badfs/Makefile badfs/fs/badfs/Makefile
--- linux/fs/badfs/Makefile Thu Jan 1 01:00:00 1970
+++ badfs/fs/badfs/Makefile Tue Aug 15 10:27:09 2000
@@ -0,0 +1,9 @@
+#
+# Makefile for badfs filesystem.
+#
+
+O_TARGET := badfs.o
+O_OBJS := inode.o
+M_OBJS := $(O_TARGET)
+
+include $(TOPDIR)/Rules.make
diff -urN -X dontdiff linux/fs/badfs/inode.c badfs/fs/badfs/inode.c
--- linux/fs/badfs/inode.c Thu Jan 1 01:00:00 1970
+++ badfs/fs/badfs/inode.c Tue Aug 15 10:27:09 2000
@@ -0,0 +1,272 @@
+/*
+ * badfs - the Bad Filesystem
+ *
+ * Author - Tigran Aivazian <tigran@veritas.com>
+ *
+ * Thanks to:
+ * Manfred Spraul <manfred@colorfullife.com>, for useful comments.
+ *
+ * This file is released under the GPL.
+ *
+ * The badfs filesystem is used by forced umount ('umount -f' command)
+ * to move inodes that keep the filesystem being umounted busy to it.
+ *
+ * The entry point into this module is via quiesce_filesystem() called
+ * from fs/super.c:do_umount() if MNT_FORCE is passed.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+
+#define BADFS_MAGIC 0xBADF0001
+
+static struct super_block *badfs_read_super(struct super_block *,void *,int);
+
+#define FS_FLAGS_BADFS (FS_NOMOUNT | FS_SINGLE)
+static DECLARE_FSTYPE(badfs_fs_type,"badfs",badfs_read_super,FS_FLAGS_BADFS);
+
+static struct vfsmount *badfs_mnt; /* returned by kern_mount() */
+static struct super_block *badfs_sb; /* badfs_mnt->mnt_sb */
+static struct dentry *badfs_root; /* badfs_sb->s_root */
+
+static int __init init_badfs_fs(void)
+{
+ int err = register_filesystem(&badfs_fs_type);
+
+ if (!err) {
+ badfs_mnt = kern_mount(&badfs_fs_type);
+ if (IS_ERR(badfs_mnt)) {
+ err = PTR_ERR(badfs_mnt);
+ unregister_filesystem(&badfs_fs_type);
+ } else {
+ badfs_sb = badfs_mnt->mnt_sb;
+ err = 0;
+ }
+ }
+ return err;
+}
+
+module_init(init_badfs_fs);
+
+static struct inode *badfs_get_inode(struct super_block *sb, int mode)
+{
+ struct inode *inode = get_empty_inode();
+
+ if (inode) {
+ make_bad_inode(inode);
+ inode->i_sb = sb;
+ inode->i_dev = sb->s_dev;
+ inode->i_mode = mode;
+ inode->i_nlink = 1;
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ }
+ return inode;
+}
+
+/* VFS ->read_super() method */
+static struct super_block *badfs_read_super(struct super_block * sb,
+ void * data, int silent)
+{
+ static struct super_operations badfs_ops = {};
+ struct inode * root = badfs_get_inode(sb, S_IFDIR|S_IRUSR|S_IWUSR);
+
+ if (!root)
+ return NULL;
+ sb->s_blocksize = 1024;
+ sb->s_blocksize_bits = 10;
+ sb->s_magic = BADFS_MAGIC;
+ sb->s_op = &badfs_ops;
+ badfs_root = sb->s_root = d_alloc(NULL,
+ &(const struct qstr){ "bad:", 5, 0});
+ if (!badfs_root) {
+ iput(root);
+ return NULL;
+ }
+ sb->s_root->d_sb = sb;
+ sb->s_root->d_parent = sb->s_root;
+ d_instantiate(sb->s_root, root);
+ return sb;
+}
+
+static void disable_pwd(struct fs_struct *fs)
+{
+ struct inode *inode;
+ struct dentry *dentry;
+
+ inode = badfs_get_inode(badfs_sb, S_IFDIR|0755);
+ if (!inode) {
+ printk(KERN_ERR "disable_pwd(): can't allocate inode\n");
+ return;
+ }
+ dentry = d_alloc(badfs_root, &(const struct qstr){"dead_pwd", 8, 0});
+ if (!dentry) {
+ iput(inode);
+ printk(KERN_ERR "disable_pwd(): can't allocate dentry\n");
+ return;
+ }
+ d_instantiate(dentry, inode);
+ dget(dentry);
+ set_fs_pwd(fs, badfs_mnt, dentry);
+}
+
+static void disable_root(struct fs_struct *fs)
+{
+ struct inode *inode;
+ struct dentry *dentry;
+
+ inode = badfs_get_inode(badfs_sb, S_IFDIR|0755);
+ if (!inode) {
+ printk(KERN_ERR "disable_root(): can't allocate inode\n");
+ return;
+ }
+ dentry = d_alloc(badfs_root, &(const struct qstr){"dead_root", 9, 0});
+ if (!dentry) {
+ iput(inode);
+ printk(KERN_ERR "disable_root(): can't allocate dentry\n");
+ return;
+ }
+ d_instantiate(dentry, inode);
+ dget(dentry);
+ set_fs_root(fs, badfs_mnt, dentry);
+}
+
+/* called from do_umount() if MNT_FORCE is specified */
+void quiesce_filesystem(struct vfsmount *mnt)
+{
+ struct task_struct *p;
+ struct file *file;
+ struct inode *inode;
+
+ /* we do three passes through the task list, examining:
+ * 1. p->fs{->pwd,root} that can keep this mnt busy
+ * 2. p->files, i.e. open files (we do_close them)
+ * 3. p->mm, i.e. mmaped files (we simply do_munmap them)
+ * There is no guarantee that by the time we restart the loop
+ * the amount of work to do in the loop has not increased.
+ */
+repeat1:
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ struct fs_struct *fs;
+
+ /* get a reference to p->fs */
+ task_lock(p);
+ fs = p->fs;
+ if (!fs) {
+ task_unlock(p);
+ continue;
+ } else
+ atomic_inc(&fs->count);
+ task_unlock(p);
+
+ if (fs->pwdmnt == mnt) {
+ read_unlock(&tasklist_lock);
+ disable_pwd(fs); /* may sleep */
+ put_fs_struct(fs);
+ goto repeat1;
+ }
+ if (fs->rootmnt == mnt) {
+ read_unlock(&tasklist_lock);
+ disable_root(fs); /* may sleep */
+ put_fs_struct(fs);
+ goto repeat1;
+ }
+ put_fs_struct(fs);
+ }
+ read_unlock(&tasklist_lock);
+
+repeat2:
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ unsigned int fd, j = 0;
+ struct files_struct *files;
+
+ /* get a reference to p->files */
+ task_lock(p);
+ files = p->files;
+ if (!files) {
+ task_unlock(p);
+ continue;
+ } else {
+ atomic_inc(&files->count);
+ write_lock(&files->file_lock);
+ }
+ task_unlock(p);
+
+ /* check if this process has open files here */
+ while (1) {
+ unsigned long set;
+
+ fd = j * __NFDBITS;
+ if (fd >= files->max_fdset || fd >= files->max_fds)
+ break;
+ set = files->open_fds->fds_bits[j++];
+ while (set) {
+ if (set & 1) {
+ file = files->fd[fd];
+ inode = file->f_dentry->d_inode;
+ if (inode && (file->f_vfsmnt==mnt)) {
+ files->fd[fd] = NULL;
+ FD_CLR(fd, files->close_on_exec);
+ __put_unused_fd(files, fd);
+ write_unlock(&files->file_lock);
+ read_unlock(&tasklist_lock);
+ put_files_struct(files);
+ filp_close(file, files);
+ goto repeat2;
+ }
+ }
+ fd++;
+ set >>= 1;
+ }
+ }
+ write_unlock(&files->file_lock);
+ put_files_struct(files);
+ }
+ read_unlock(&tasklist_lock);
+
+repeat3:
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+
+ /* get a reference to p->mm */
+ task_lock(p);
+ mm = p->mm;
+ if (!mm) {
+ task_unlock(p);
+ continue;
+ } else
+ atomic_inc(&mm->mm_users);
+ task_unlock(p);
+
+ /* check for mmap'd files and unmap them */
+ vmlist_modify_lock(mm);
+ for (vma = mm->mmap; vma; vma=vma->vm_next) {
+ file = vma->vm_file;
+ if (!file)
+ continue;
+ inode = file->f_dentry->d_inode;
+ if (!inode || !inode->i_sb)
+ continue;
+ if (file->f_vfsmnt == mnt) {
+ vmlist_modify_unlock(mm);
+ read_unlock(&tasklist_lock);
+ down(&mm->mmap_sem);
+ do_munmap(mm, vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ up(&mm->mmap_sem);
+ mmput(mm);
+ goto repeat3;
+ }
+ }
+ vmlist_modify_unlock(mm);
+ mmput(mm);
+ }
+ read_unlock(&tasklist_lock);
+}
diff -urN -X dontdiff linux/fs/super.c badfs/fs/super.c
--- linux/fs/super.c Tue Aug 15 10:25:23 2000
+++ badfs/fs/super.c Tue Aug 15 10:27:09 2000
@@ -1033,6 +1033,9 @@
                 return retval;
         }
 
+ if (flags&MNT_FORCE)
+ quiesce_filesystem(mnt);
+
         spin_lock(&dcache_lock);
         if (atomic_read(&mnt->mnt_count) > 2) {
                 spin_unlock(&dcache_lock);
diff -urN -X dontdiff linux/include/linux/fs.h badfs/include/linux/fs.h
--- linux/include/linux/fs.h Tue Aug 15 10:25:23 2000
+++ badfs/include/linux/fs.h Tue Aug 15 10:37:49 2000
@@ -1189,6 +1189,8 @@
 extern kdev_t ROOT_DEV;
 extern char root_device_name[];
 
+/* fs/badfs/inode.c - used by forced umount */
+extern void quiesce_filesystem(struct vfsmount *);
 
 extern void show_buffers(void);
 extern void mount_root(void);

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Thu Aug 31 2000 - 21:00:23 EST