Re: [PATCH RFC v2 1/3] pidfs: rework inode number allocation

From: Jan Kara
Date: Mon Dec 02 2024 - 10:21:35 EST


On Fri 29-11-24 14:02:23, Christian Brauner wrote:
> Recently we received a patchset that aims to enable file handle encoding
> and decoding via name_to_handle_at(2) and open_by_handle_at(2).
>
> A crucical step in the patch series is how to go from inode number to
> struct pid without leaking information into unprivileged contexts. The
> issue is that in order to find a struct pid the pid number in the
> initial pid namespace must be encoded into the file handle via
> name_to_handle_at(2). This can be used by containers using a separate
> pid namespace to learn what the pid number of a given process in the
> initial pid namespace is. While this is a weak information leak it could
> be used in various exploits and in general is an ugly wart in the design.
>
> To solve this problem a new way is needed to lookup a struct pid based
> on the inode number allocated for that struct pid. The other part is to
> remove the custom inode number allocation on 32bit systems that is also
> an ugly wart that should go away.
>
> So, a new scheme is used that I was discusssing with Tejun some time
> back. A cyclic ida is used for the lower 32 bits and a the high 32 bits
> are used for the generation number. This gives a 64 bit inode number
> that is unique on both 32 bit and 64 bit. The lower 32 bit number is
> recycled slowly and can be used to lookup struct pids.
>
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>

Looks good to me. Feel free to add:

Reviewed-by: Jan Kara <jack@xxxxxxx>

Honza

> ---
> fs/pidfs.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/pidfs.h | 2 ++
> kernel/pid.c | 14 ++++++------
> 3 files changed, 72 insertions(+), 7 deletions(-)
>
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 618abb1fa1b84cf31282c922374e28d60cd49d00..0bdd9c525b80895d33f2eae5e8e375788580072f 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -23,6 +23,59 @@
> #include "internal.h"
> #include "mount.h"
>
> +static u32 pidfs_ino_highbits;
> +static u32 pidfs_ino_last_ino_lowbits;
> +
> +static DEFINE_IDR(pidfs_ino_idr);
> +
> +static inline ino_t pidfs_ino(u64 ino)
> +{
> + /* On 32 bit low 32 bits are the inode. */
> + if (sizeof(ino_t) < sizeof(u64))
> + return (u32)ino;
> +
> + /* On 64 bit simply return ino. */
> + return ino;
> +}
> +
> +static inline u32 pidfs_gen(u64 ino)
> +{
> + /* On 32 bit the generation number are the upper 32 bits. */
> + if (sizeof(ino_t) < sizeof(u64))
> + return ino >> 32;
> +
> + /* On 64 bit the generation number is 1. */
> + return 1;
> +}
> +
> +/*
> + * Construct an inode number for struct pid in a way that we can use the
> + * lower 32bit to lookup struct pid independent of any pid numbers that
> + * could be leaked into userspace (e.g., via file handle encoding).
> + */
> +int pidfs_add_pid(struct pid *pid)
> +{
> + u32 ino_highbits;
> + int ret;
> +
> + ret = idr_alloc_cyclic(&pidfs_ino_idr, pid, 1, 0, GFP_ATOMIC);
> + if (ret >= 0 && ret < pidfs_ino_last_ino_lowbits)
> + pidfs_ino_highbits++;
> + ino_highbits = pidfs_ino_highbits;
> + pidfs_ino_last_ino_lowbits = ret;
> + if (ret < 0)
> + return ret;
> +
> + pid->ino = (u64)ino_highbits << 32 | ret;
> + pid->stashed = NULL;
> + return 0;
> +}
> +
> +void pidfs_remove_pid(struct pid *pid)
> +{
> + idr_remove(&pidfs_ino_idr, (u32)pidfs_ino(pid->ino));
> +}
> +
> #ifdef CONFIG_PROC_FS
> /**
> * pidfd_show_fdinfo - print information about a pidfd
> @@ -491,6 +544,16 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
>
> void __init pidfs_init(void)
> {
> + /*
> + * On 32 bit systems the lower 32 bits are the inode number and
> + * the higher 32 bits are the generation number. The starting
> + * value for the inode number and the generation number is one.
> + */
> + if (sizeof(ino_t) < sizeof(u64))
> + pidfs_ino_highbits = 1;
> + else
> + pidfs_ino_highbits = 0;
> +
> pidfs_mnt = kern_mount(&pidfs_type);
> if (IS_ERR(pidfs_mnt))
> panic("Failed to mount pidfs pseudo filesystem");
> diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
> index 75bdf9807802a5d1a9699c99aa42648c2bd34170..2958652bb108b8a2e02128e17317be4545b40a01 100644
> --- a/include/linux/pidfs.h
> +++ b/include/linux/pidfs.h
> @@ -4,5 +4,7 @@
>
> struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
> void __init pidfs_init(void);
> +int pidfs_add_pid(struct pid *pid);
> +void pidfs_remove_pid(struct pid *pid);
>
> #endif /* _LINUX_PID_FS_H */
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 115448e89c3e9e664d0d51c8d853e8167ba0540c..6131543e7c090c164a2bac014f8eeee61926b13d 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -64,11 +64,6 @@ int pid_max = PID_MAX_DEFAULT;
>
> int pid_max_min = RESERVED_PIDS + 1;
> int pid_max_max = PID_MAX_LIMIT;
> -/*
> - * Pseudo filesystems start inode numbering after one. We use Reserved
> - * PIDs as a natural offset.
> - */
> -static u64 pidfs_ino = RESERVED_PIDS;
>
> /*
> * PID-map pages start out as NULL, they get allocated upon
> @@ -157,6 +152,7 @@ void free_pid(struct pid *pid)
> }
>
> idr_remove(&ns->idr, upid->nr);
> + pidfs_remove_pid(pid);
> }
> spin_unlock_irqrestore(&pidmap_lock, flags);
>
> @@ -273,22 +269,26 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> INIT_HLIST_HEAD(&pid->inodes);
>
> upid = pid->numbers + ns->level;
> + idr_preload(GFP_KERNEL);
> spin_lock_irq(&pidmap_lock);
> if (!(ns->pid_allocated & PIDNS_ADDING))
> goto out_unlock;
> - pid->stashed = NULL;
> - pid->ino = ++pidfs_ino;
> + retval = pidfs_add_pid(pid);
> + if (retval)
> + goto out_unlock;
> for ( ; upid >= pid->numbers; --upid) {
> /* Make the PID visible to find_pid_ns. */
> idr_replace(&upid->ns->idr, pid, upid->nr);
> upid->ns->pid_allocated++;
> }
> spin_unlock_irq(&pidmap_lock);
> + idr_preload_end();
>
> return pid;
>
> out_unlock:
> spin_unlock_irq(&pidmap_lock);
> + idr_preload_end();
> put_pid_ns(ns);
>
> out_free:
>
> --
> 2.45.2
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR