[PATCH 13/13] vfs: Delete struct fdtable

From: Sandhya Bankar
Date: Sat Apr 29 2017 - 03:52:34 EST


Completing the conversion of the file descriptor allocation code to use
the IDR.

This patch includes below changes:

- Move max_fds from struct fdtable to files_struct.
- Added fill_max_fds() routine to calculate the new value of max_fds
to matches the old behaviour of alloc_fdtable() code which is
user-visible through /proc.
- Remove struct fdtable
- Removed resize_in_progress, resize_wait from files_struct
- Delete open_fds() & count_open_files()
- Use READ_ONCE() instead of rcu_read_lock/unlock(). The rcu_read_lock()/
unlock() was used to dereference the files->fdt. Now we don't use RCU to
protect files->max_fds. So using READ_ONCE() macro to read the value of
files->max_fds.

Signed-off-by: Sandhya Bankar <bankarsandhya512@xxxxxxxxx>
Signed-off-by: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
---
fs/compat.c | 6 +-
fs/file.c | 354 +++++++-----------------------------------------
fs/proc/array.c | 2 +-
fs/proc/fd.c | 2 +-
fs/select.c | 6 +-
include/linux/fdtable.h | 31 +----
6 files changed, 54 insertions(+), 347 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index c61b506..7483c9c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1153,17 +1153,13 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
fd_set_bits fds;
void *bits;
int size, max_fds, ret = -EINVAL;
- struct fdtable *fdt;
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

if (n < 0)
goto out_nofds;

/* max_fds can increase, so grab it once to avoid race */
- rcu_read_lock();
- fdt = files_fdtable(current->files);
- max_fds = fdt->max_fds;
- rcu_read_unlock();
+ max_fds = READ_ONCE(current->files->max_fds);
if (n > max_fds)
n = max_fds;

diff --git a/fs/file.c b/fs/file.c
index 23f198b..3e6cf10 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -31,191 +31,36 @@
unsigned int sysctl_nr_open_max =
__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

-static void *alloc_fdmem(size_t size)
+static int fill_max_fds(struct files_struct *files, unsigned int nr)
{
- /*
- * Very large allocations can stress page reclaim, so fall back to
- * vmalloc() if the allocation size will be considered "large" by the VM.
- */
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
- __GFP_NOWARN | __GFP_NORETRY);
- if (data != NULL)
- return data;
- }
- return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
-}
+ unsigned int nr_open;

-static void __free_fdtable(struct fdtable *fdt)
-{
- kvfree(fdt->open_fds);
- kfree(fdt);
-}
-
-static void free_fdtable_rcu(struct rcu_head *rcu)
-{
- __free_fdtable(container_of(rcu, struct fdtable, rcu));
-}
-
-/*
- * Copy 'count' fd bits from the old table to the new table and clear the extra
- * space if any. This does not copy the file pointers. Called with the files
- * spinlock held for write.
- */
-static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
- unsigned int count)
-{
- unsigned int cpy, set;
-
- cpy = count / BITS_PER_BYTE;
- set = (nfdt->max_fds - count) / BITS_PER_BYTE;
- memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
- memset((char *)nfdt->open_fds + cpy, 0, set);
-}
+ if (likely(nr < files->max_fds))
+ return 0;

-/*
- * Copy all file descriptors from the old table to the new, expanded table and
- * clear the extra space. Called with the files spinlock held for write.
- */
-static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
-{
- BUG_ON(nfdt->max_fds < ofdt->max_fds);
- copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
-}
+ nr_open = READ_ONCE(sysctl_nr_open);

-static struct fdtable * alloc_fdtable(unsigned int nr)
-{
- struct fdtable *fdt;
- void *data;
+ if (nr >= nr_open)
+ return -EMFILE;

/*
- * Figure out how many fds we actually want to support in this fdtable.
- * Allocation steps are keyed to the size of the fdarray, since it
- * grows far faster than any of the other dynamic data. We try to fit
- * the fdarray into comfortable page-tuned chunks: starting at 1024B
- * and growing in powers of two from there on.
+ * This calculation of the new value of max_fds matches the old
+ * behaviour of this code which is user-visible through /proc.
+ * nr may exceed the sysctl_nr_open value by a small amount.
+ * This used to be necessary to keep the bitmaps aligned, and
+ * we keep it the same even though the IDR handles the bitmaps now.
*/
+
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
- /*
- * Note that this can drive nr *below* what we had passed if sysctl_nr_open
- * had been set lower between the check in expand_files() and here. Deal
- * with that in caller, it's cheaper that way.
- *
- * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
- * bitmaps handling below becomes unpleasant, to put it mildly...
- */
- if (unlikely(nr > sysctl_nr_open))
- nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

- fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
- if (!fdt)
- goto out;
- fdt->max_fds = nr;
+ if (unlikely(nr > nr_open))
+ nr = ((nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

- data = alloc_fdmem(max_t(size_t, nr / BITS_PER_BYTE, L1_CACHE_BYTES));
- if (!data)
- goto out_fdt;
- fdt->open_fds = data;
+ files->max_fds = nr;

- return fdt;
-
-out_fdt:
- kfree(fdt);
-out:
- return NULL;
-}
-
-/*
- * Expand the file descriptor table.
- * This function will allocate a new fdtable and both fd array and fdset, of
- * the given size.
- * Return <0 error code on error; 1 on successful completion.
- * The files->file_lock should be held on entry, and will be held on exit.
- */
-static int expand_fdtable(struct files_struct *files, unsigned int nr)
- __releases(files->file_lock)
- __acquires(files->file_lock)
-{
- struct fdtable *new_fdt, *cur_fdt;
-
- spin_unlock(&files->file_lock);
- idr_preload_end();
- new_fdt = alloc_fdtable(nr);
-
- /* make sure all __fd_install() have seen resize_in_progress
- * or have finished their rcu_read_lock_sched() section.
- */
- if (atomic_read(&files->count) > 1)
- synchronize_sched();
-
- idr_preload(GFP_KERNEL);
- spin_lock(&files->file_lock);
- if (!new_fdt)
- return -ENOMEM;
- /*
- * extremely unlikely race - sysctl_nr_open decreased between the check in
- * caller and alloc_fdtable(). Cheaper to catch it here...
- */
- if (unlikely(new_fdt->max_fds <= nr)) {
- __free_fdtable(new_fdt);
- return -EMFILE;
- }
- cur_fdt = files_fdtable(files);
- BUG_ON(nr < cur_fdt->max_fds);
- copy_fdtable(new_fdt, cur_fdt);
- rcu_assign_pointer(files->fdt, new_fdt);
- if (cur_fdt != &files->fdtab)
- call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
- /* coupled with smp_rmb() in __fd_install() */
- smp_wmb();
- return 1;
-}
-
-/*
- * Expand files.
- * This function will expand the file structures, if the requested size exceeds
- * the current capacity and there is room for expansion.
- * Return <0 error code on error; 0 when nothing done; 1 when files were
- * expanded and execution may have blocked.
- * The files->file_lock should be held on entry, and will be held on exit.
- */
-static int expand_files(struct files_struct *files, unsigned int nr)
- __releases(files->file_lock)
- __acquires(files->file_lock)
-{
- struct fdtable *fdt;
- int expanded = 0;
-
-repeat:
- fdt = files_fdtable(files);
-
- /* Do we need to expand? */
- if (nr < fdt->max_fds)
- return expanded;
-
- /* Can we expand? */
- if (nr >= sysctl_nr_open)
- return -EMFILE;
-
- if (unlikely(files->resize_in_progress)) {
- spin_unlock(&files->file_lock);
- idr_preload_end();
- expanded = 1;
- wait_event(files->resize_wait, !files->resize_in_progress);
- idr_preload(GFP_KERNEL);
- spin_lock(&files->file_lock);
- goto repeat;
- }
-
- /* All good, so we try */
- files->resize_in_progress = true;
- expanded = expand_fdtable(files, nr);
- files->resize_in_progress = false;
-
- wake_up_all(&files->resize_wait);
- return expanded;
+ return 0;
}

static inline bool fd_is_open(unsigned int fd, struct files_struct *files)
@@ -235,28 +80,21 @@ static inline void __clear_close_on_exec(unsigned int fd,
idr_tag_clear(&files->fd_idr, fd, FD_TAG_CLOEXEC);
}

-static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
-{
- __set_bit(fd, fdt->open_fds);
-}
-
-static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
-{
- __clear_bit(fd, fdt->open_fds);
-}
-
-static unsigned int count_open_files(struct fdtable *fdt)
+static void close_files(struct files_struct * files)
{
- unsigned int size = fdt->max_fds;
- unsigned int i;
+ /*
+ * No need for RCU or ->file_lock protection because
+ * this is the last reference to the files structure.
+ */
+ struct file *file;
+ int fd;

- /* Find the last open fd */
- for (i = size / BITS_PER_LONG; i > 0; ) {
- if (fdt->open_fds[--i])
- break;
+ idr_for_each_entry(&files->fd_idr, file, fd) {
+ filp_close(file, files);
+ cond_resched_rcu_qs();
}
- i = (i + 1) * BITS_PER_LONG;
- return i;
+
+ idr_destroy(&files->fd_idr);
}

/*
@@ -267,9 +105,8 @@ static unsigned int count_open_files(struct fdtable *fdt)
struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
struct files_struct *newf;
- unsigned int open_files, i;
+ unsigned int i;
struct file *f;
- struct fdtable *old_fdt, *new_fdt;

*errorp = -ENOMEM;
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
@@ -280,51 +117,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)

spin_lock_init(&newf->file_lock);
idr_init(&newf->fd_idr);
- newf->resize_in_progress = false;
- init_waitqueue_head(&newf->resize_wait);
- new_fdt = &newf->fdtab;
- new_fdt->max_fds = NR_OPEN_DEFAULT;
- new_fdt->open_fds = newf->open_fds_init;
+ newf->max_fds = NR_OPEN_DEFAULT;

restart:
idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
spin_lock(&oldf->file_lock);
- old_fdt = files_fdtable(oldf);
- open_files = count_open_files(old_fdt);
-
- /*
- * Check whether we need to allocate a larger fd array and fd set.
- */
- while (unlikely(open_files > new_fdt->max_fds)) {
- spin_unlock(&oldf->file_lock);
- idr_preload_end();
-
- if (new_fdt != &newf->fdtab)
- __free_fdtable(new_fdt);
-
- new_fdt = alloc_fdtable(open_files - 1);
- if (!new_fdt) {
- *errorp = -ENOMEM;
- goto out_release;
- }
-
- /* beyond sysctl_nr_open; nothing to do */
- if (unlikely(new_fdt->max_fds < open_files)) {
- __free_fdtable(new_fdt);
- *errorp = -EMFILE;
- goto out_release;
- }
-
- /*
- * Reacquire the oldf lock and a pointer to its fd table
- * who knows it may have a new bigger fd table. We need
- * the latest pointer.
- */
- idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
- spin_lock(&oldf->file_lock);
- old_fdt = files_fdtable(oldf);
- open_files = count_open_files(old_fdt);
- }

if (!idr_check_preload(&oldf->fd_idr)) {
spin_unlock(&oldf->file_lock);
@@ -332,8 +129,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
goto restart;
}

- copy_fd_bitmaps(new_fdt, old_fdt, open_files);
-
idr_for_each_entry(&oldf->fd_idr, f, i) {
int err;

@@ -341,6 +136,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
err = idr_alloc(&newf->fd_idr, f, i, i + 1, GFP_NOWAIT);
if (WARN(err != i, "Could not allocate %d: %d", i, err)) {
spin_unlock(&oldf->file_lock);
+ idr_preload_end();
goto out;
}

@@ -351,42 +147,18 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
spin_unlock(&oldf->file_lock);
idr_preload_end();

- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open().
- */
- for_each_set_bit(i, new_fdt->open_fds, new_fdt->max_fds) {
- if (!idr_find(&newf->fd_idr, i))
- __clear_bit(i, new_fdt->open_fds);
+ if (unlikely(fill_max_fds(newf, i) < 0)) {
+ *errorp = -EMFILE;
+ goto out;
}

- rcu_assign_pointer(newf->fdt, new_fdt);
-
return newf;

-out_release:
- idr_destroy(&newf->fd_idr);
- kmem_cache_free(files_cachep, newf);
out:
- return NULL;
-}
-
-static void close_files(struct files_struct * files)
-{
- /*
- * No need for RCU or ->file_lock protection because
- * this is the last reference to the files structure.
- */
- struct file *file;
- int fd;
-
- idr_for_each_entry(&files->fd_idr, file, fd) {
- filp_close(file, files);
- cond_resched_rcu_qs();
- }
+ close_files(newf);
+ kmem_cache_free(files_cachep, newf);

- idr_destroy(&files->fd_idr);
+ return NULL;
}

struct files_struct *get_files_struct(struct task_struct *task)
@@ -405,12 +177,7 @@ struct files_struct *get_files_struct(struct task_struct *task)
void put_files_struct(struct files_struct *files)
{
if (atomic_dec_and_test(&files->count)) {
- struct fdtable *fdt = rcu_dereference_raw(files->fdt);
close_files(files);
-
- /* free the arrays if they are not embedded */
- if (fdt != &files->fdtab)
- __free_fdtable(fdt);
kmem_cache_free(files_cachep, files);
}
}
@@ -441,11 +208,7 @@ void exit_files(struct task_struct *tsk)

struct files_struct init_files = {
.count = ATOMIC_INIT(1),
- .fdt = &init_files.fdtab,
- .fdtab = {
- .max_fds = NR_OPEN_DEFAULT,
- .open_fds = init_files.open_fds_init,
- },
+ .max_fds = NR_OPEN_DEFAULT,
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
.fd_idr = IDR_INIT,
};
@@ -458,7 +221,6 @@ int __alloc_fd(struct files_struct *files,
{
unsigned int fd;
int error;
- struct fdtable *fdt;

idr_preload(GFP_KERNEL);
spin_lock(&files->file_lock);
@@ -471,14 +233,12 @@ int __alloc_fd(struct files_struct *files,
BUG_ON(error < 0);
fd = error;

- error = expand_files(files, fd);
+ error = fill_max_fds(files, fd);
if (error < 0) {
idr_remove(&files->fd_idr, fd);
goto out;
}

- fdt = files_fdtable(files);
- __set_open_fd(fd, fdt);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, files);
else
@@ -502,18 +262,11 @@ int get_unused_fd_flags(unsigned flags)
}
EXPORT_SYMBOL(get_unused_fd_flags);

-static void __put_unused_fd(struct files_struct *files, unsigned int fd)
-{
- struct fdtable *fdt = files_fdtable(files);
- __clear_open_fd(fd, fdt);
-}
-
void put_unused_fd(unsigned int fd)
{
struct files_struct *files = current->files;
spin_lock(&files->file_lock);
BUG_ON(idr_remove(&files->fd_idr, fd));
- __put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
}

@@ -560,17 +313,12 @@ void fd_install(unsigned int fd, struct file *file)
int __close_fd(struct files_struct *files, unsigned fd)
{
struct file *file;
- struct fdtable *fdt;

spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (fd >= fdt->max_fds)
- goto out_unlock;
file = idr_remove(&files->fd_idr, fd);
if (!file)
goto out_unlock;
__clear_close_on_exec(fd, files);
- __put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
return filp_close(file, files);

@@ -589,7 +337,6 @@ void do_close_on_exec(struct files_struct *files)

idr_for_each_entry_tagged(&files->fd_idr, file, fd, FD_TAG_CLOEXEC) {
idr_remove(&files->fd_idr, fd);
- __put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
filp_close(file, files);
cond_resched();
@@ -718,7 +465,6 @@ static int do_dup2(struct files_struct *files,
__releases(&files->file_lock)
{
struct file *tofree;
- struct fdtable *fdt;

/*
* We need to detect attempts to do dup2() over allocated but still
@@ -734,7 +480,6 @@ static int do_dup2(struct files_struct *files,
* scope of POSIX or SUS, since neither considers shared descriptor
* tables and this condition does not arise without those.
*/
- fdt = files_fdtable(files);
tofree = idr_find(&files->fd_idr, fd);
if (!tofree && fd_is_open(fd, files))
goto Ebusy;
@@ -749,7 +494,6 @@ static int do_dup2(struct files_struct *files,
goto Ebusy;
}
}
- __set_open_fd(fd, fdt);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, files);
else
@@ -781,9 +525,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)

idr_preload(GFP_KERNEL);
spin_lock(&files->file_lock);
- err = expand_files(files, fd);
+ err = fill_max_fds(files, fd);
if (unlikely(err < 0))
goto out_unlock;
+
return do_dup2(files, file, fd, flags);

out_unlock:
@@ -809,20 +554,18 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)

idr_preload(GFP_KERNEL);
spin_lock(&files->file_lock);
- err = expand_files(files, newfd);
+ err = fill_max_fds(files, newfd);
+ if (unlikely(err < 0))
+ goto Ebadf;
+
file = fcheck(oldfd);
if (unlikely(!file))
goto Ebadf;
- if (unlikely(err < 0)) {
- if (err == -EMFILE)
- goto Ebadf;
- goto out_unlock;
- }
+
return do_dup2(files, file, newfd, flags);

Ebadf:
err = -EBADF;
-out_unlock:
spin_unlock(&files->file_lock);
idr_preload_end();
return err;
@@ -875,12 +618,11 @@ int iterate_fd(struct files_struct *files, unsigned n,
int (*f)(const void *, struct file *, unsigned),
const void *p)
{
- struct fdtable *fdt;
int res = 0;
if (!files)
return 0;
spin_lock(&files->file_lock);
- for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
+ for (; n < files->max_fds; n++) {
struct file *file;
file = __fcheck_files(files, n);
if (!file)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c3555..ec6fdaf 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -186,7 +186,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,

task_lock(p);
if (p->files)
- max_fds = files_fdtable(p->files)->max_fds;
+ max_fds = READ_ONCE(p->files->max_fds);
task_unlock(p);
rcu_read_unlock();

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 2735ccc..7e5aeca 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -231,7 +231,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,

rcu_read_lock();
for (fd = ctx->pos - 2;
- fd < files_fdtable(files)->max_fds;
+ fd < files->max_fds;
fd++, ctx->pos++) {
char name[PROC_NUMBUF];
int len;
diff --git a/fs/select.c b/fs/select.c
index 5d20a14..14da393 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -557,7 +557,6 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
void *bits;
int ret, max_fds;
size_t size, alloc_size;
- struct fdtable *fdt;
/* Allocate small arguments on the stack to save memory and be faster */
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

@@ -566,10 +565,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
goto out_nofds;

/* max_fds can increase, so grab it once to avoid race */
- rcu_read_lock();
- fdt = files_fdtable(current->files);
- max_fds = fdt->max_fds;
- rcu_read_unlock();
+ max_fds = READ_ONCE(current->files->max_fds);
if (n > max_fds)
n = max_fds;

diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 7f1ab82..ff94541 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -24,43 +24,16 @@

#define FD_TAG_CLOEXEC 1

-struct fdtable {
- unsigned int max_fds;
- unsigned long *open_fds;
- struct rcu_head rcu;
-};
-
/*
* Open file table structure
*/
struct files_struct {
- /*
- * read mostly part
- */
+ spinlock_t file_lock;
atomic_t count;
- bool resize_in_progress;
- wait_queue_head_t resize_wait;
-
struct idr fd_idr;
- struct fdtable __rcu *fdt;
- struct fdtable fdtab;
- /*
- * written part on a separate cache line in SMP
- */
- spinlock_t file_lock ____cacheline_aligned_in_smp;
- unsigned long open_fds_init[1];
+ unsigned int max_fds;
};

-struct file_operations;
-struct vfsmount;
-struct dentry;
-
-#define rcu_dereference_check_fdtable(files, fdtfd) \
- rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))
-
-#define files_fdtable(files) \
- rcu_dereference_check_fdtable((files), (files)->fdt)
-
/*
* The caller must ensure that fd table isn't shared or hold rcu or file lock
*/
--
1.8.3.1