Re: [RFC] proc: report open files as size in stat() for /proc/pid/fd

From: Ivan Babrou
Date: Sat Sep 17 2022 - 14:32:23 EST


> > > * Make fd count acces O(1) and expose it in /proc/pid/status
>
> This is doable, next to FDSize.

It feels like a better solution, but maybe I'm missing some context
here. Let me know whether this is preferred.

That said, I've tried doing it, but failed. The new FDUsed value is
noticeably higher than the number of entries listed in /proc/pid/fd:

* systemd:

ivan@vm:~$ sudo ls -l /proc/1/fd | wc -l
66
ivan@vm:~$ cat /proc/1/status | fgrep FD
FDSize: 256
FDUsed: 71

* journald:

ivan@vm:~$ sudo ls -l /proc/803/fd | wc -l
29
ivan@vm:~$ cat /proc/803/status | fgrep FD
FDSize: 128
FDUsed: 37

I'll see if I can make it work next week. I'm happy to receive tips as well.

Below is my attempt (link in case gmail breaks patch formatting):

* https://gist.githubusercontent.com/bobrik/acce40881d629d8cce2e55966b31a0a2/raw/716eb4724a8fe3afeeb76fd2a7a47ee13790a9e9/fdused.patch

diff --git a/fs/file.c b/fs/file.c
index 3bcc1ecc314a..8bc0741cabf1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -85,6 +85,8 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
memset((char *)nfdt->fd + cpy, 0, set);

copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
+
+ atomic_set(&nfdt->count, atomic_read(&ofdt->count));
}

/*
@@ -105,6 +107,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
static struct fdtable * alloc_fdtable(unsigned int nr)
{
struct fdtable *fdt;
+ atomic_t count = ATOMIC_INIT(0);
void *data;

/*
@@ -148,6 +151,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
fdt->close_on_exec = data;
data += nr / BITS_PER_BYTE;
fdt->full_fds_bits = data;
+ fdt->count = count;

return fdt;

@@ -399,6 +403,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
/* clear the remainder */
memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

+ atomic_set(&new_fdt->count, atomic_read(&old_fdt->count));
+
rcu_assign_pointer(newf->fdt, new_fdt);

return newf;
@@ -474,6 +480,7 @@ struct files_struct init_files = {
.close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init,
.full_fds_bits = init_files.full_fds_bits_init,
+ .count = ATOMIC_INIT(0),
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
.resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
@@ -613,6 +620,7 @@ void fd_install(unsigned int fd, struct file *file)
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
+ atomic_inc(&fdt->count);
return;
}
/* coupled with smp_wmb() in expand_fdtable() */
@@ -621,6 +629,7 @@ void fd_install(unsigned int fd, struct file *file)
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();
+ atomic_inc(&fdt->count);
}

EXPORT_SYMBOL(fd_install);
@@ -646,6 +655,7 @@ static struct file *pick_file(struct files_struct *files, unsigned fd)
if (file) {
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
+ atomic_dec(&fdt->count);
}
return file;
}
@@ -844,6 +854,7 @@ void do_close_on_exec(struct files_struct *files)
filp_close(file, files);
cond_resched();
spin_lock(&files->file_lock);
+ atomic_dec(&fdt->count);
}

}
@@ -1108,6 +1119,7 @@ __releases(&files->file_lock)
else
__clear_close_on_exec(fd, fdt);
spin_unlock(&files->file_lock);
+ atomic_inc(&fdt->count);

if (tofree)
filp_close(tofree, files);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 99fcbfda8e25..5847f077bfc3 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -153,7 +153,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct task_struct *tracer;
const struct cred *cred;
pid_t ppid, tpid = 0, tgid, ngid;
- unsigned int max_fds = 0;
+ struct fdtable *fdt;
+ unsigned int max_fds = 0, open_fds = 0;

rcu_read_lock();
ppid = pid_alive(p) ?
@@ -170,8 +171,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
task_lock(p);
if (p->fs)
umask = p->fs->umask;
- if (p->files)
- max_fds = files_fdtable(p->files)->max_fds;
+ if (p->files) {
+ fdt = files_fdtable(p->files);
+ max_fds = fdt->max_fds;
+ open_fds = atomic_read(&fdt->count);
+ }
task_unlock(p);
rcu_read_unlock();

@@ -194,6 +198,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid));
seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid));
seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);
+ seq_put_decimal_ull(m, "\nFDUsed:\t", open_fds);

seq_puts(m, "\nGroups:\t");
group_info = cred->group_info;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..59aceb1e4bc6 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -31,6 +31,7 @@ struct fdtable {
unsigned long *open_fds;
unsigned long *full_fds_bits;
struct rcu_head rcu;
+ atomic_t count;
};

static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)


> > > +
> > > + generic_fillattr(&init_user_ns, inode, stat);
> ^^^^^^^^^^^^^
>
> Is this correct? I'm not userns guy at all.

I mostly copied from here:

* https://elixir.bootlin.com/linux/v6.0-rc5/source/fs/proc/generic.c#L150

Maybe it can be simplified even further to match this one:

* https://elixir.bootlin.com/linux/v6.0-rc5/source/fs/proc/root.c#L317