Re: [PATCH 1/1] pgo: Fix sleep in atomic section in prf_open()

From: Kees Cook
Date: Wed Jun 02 2021 - 13:32:32 EST


On Wed, Jun 02, 2021 at 07:26:40PM +0300, Jarmo Tiitto wrote:
> In prf_open() the required buffer size can be so large that
> vzalloc() may sleep, thus triggering this bug:
>
> ======
> BUG: sleeping function called from invalid context at include/linux/sched/mm.h:201
> in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 337, name: cat
> CPU: 1 PID: 337 Comm: cat Not tainted 5.13.0-rc2-24-hack+ #154
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> Call Trace:
> dump_stack+0xc7/0x134
> ___might_sleep+0x177/0x190
> __might_sleep+0x5a/0x90
> kmem_cache_alloc_node_trace+0x6b/0x3a0
> ? __get_vm_area_node+0xcd/0x1b0
> ? dput+0x283/0x300
> __get_vm_area_node+0xcd/0x1b0
> __vmalloc_node_range+0x7b/0x420
> ? prf_open+0x1da/0x580
> ? prf_open+0x32/0x580
> ? __llvm_profile_instrument_memop+0x36/0x50
> vzalloc+0x54/0x60
> ? prf_open+0x1da/0x580
> prf_open+0x1da/0x580
> full_proxy_open+0x211/0x370
> ....
> ======

Ah-ha; nice catch!

>
> This patch avoids holding the prf_lock() while calling
> vzalloc(). The problem with that is that prf_buffer_size()
> *must* be called with prf_lock() held, and the buffer
> size may change while we call vzalloc().
>
> So first get the buffer size, release the lock and allocate.
> Then re-lock and call prf_serialize(), which now checks whether
> the buffer is big enough. If not, the code loops.
>
> Signed-off-by: Jarmo Tiitto <jarmo.tiitto@xxxxxxxxx>
> ---
> kernel/pgo/fs.c | 45 +++++++++++++++++++++++++++++++++++----------
> 1 file changed, 35 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/pgo/fs.c b/kernel/pgo/fs.c
> index ef985159dad3..e8ac07637423 100644
> --- a/kernel/pgo/fs.c
> +++ b/kernel/pgo/fs.c
> @@ -227,16 +227,15 @@ static unsigned long prf_buffer_size(void)
> * Serialize the profiling data into a format LLVM's tools can understand.
> * Note: caller *must* hold pgo_lock.
> */
> -static int prf_serialize(struct prf_private_data *p)
> +static int prf_serialize(struct prf_private_data *p, unsigned long *buf_size)
> {
> int err = 0;
> void *buffer;
>
> - p->size = prf_buffer_size();
> - p->buffer = vzalloc(p->size);
> + *buf_size = prf_buffer_size();
>
> - if (!p->buffer) {
> - err = -ENOMEM;
> + if (p->size < *bufsize) {

Nit: please change prf_private_data::size to size_t while you're
touching this code.

> + err = -EAGAIN;
> goto out;
> }
>
> @@ -259,6 +258,7 @@ static int prf_open(struct inode *inode, struct file *file)
> {
> struct prf_private_data *data;
> unsigned long flags;
> + unsigned long buf_size;
> int err;
>
> data = kzalloc(sizeof(*data), GFP_KERNEL);
> @@ -267,14 +267,39 @@ static int prf_open(struct inode *inode, struct file *file)
> goto out;
> }
>
> + /* note: vzalloc() cannot be used in atomic section.
> + * However to get the buffer size prf_lock() *must*
> + * be taken. So take lock, get buffer size, release
> + * the lock and allocate.
> + * prf_serialize() then checks if buffer has enough space.
> + */
> flags = prf_lock();
> + buf_size = prf_buffer_size();
>
> - err = prf_serialize(data);
> - if (unlikely(err)) {
> - kfree(data);
> - goto out_unlock;
> - }
> + do {
> + prf_unlock(flags);
> +
> + /* resize buffer */
> + if (data->size < buf_size && data->buffer) {
> + vfree(data->buffer);
> + data->buffer = NULL;
> + }
> +
> + if (!data->buffer) {
> + data->size = buf_size;
> + data->buffer = vzalloc(data->size);
> +
> + if (!data->buffer) {
> + err = -ENOMEM;
> + kfree(data);
> + goto out;
> + }
> + }
> + /* try serialize */
> + flags = prf_lock();
> + } while (prf_serialize(data, &buf_size));

I'm not a fan of loops where it's hard to answer the question "how do we
know this loop will always terminate?"

Given that vmalloc allocates PAGE_SIZE-granular regions, how about
rounding the size up to a page boundary (to likely avoid multiple passes)
and putting the growth check explicitly in the loop, rather than just
retrying on "any" prf_serialize() failure?

e.g.:

struct prf_private_data *data;
int err = -ENOMEM;

data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
goto out_free;

do {
unsigned long flags;
size_t size;

size = PAGE_ALIGN(prf_buffer_size());
/* Required buffer size must be growing with each loop. */
if (WARN_ON_ONCE(size <= data->size)) {
err = -ENOMEM;
goto out_free;
}

if (data->buf)
vfree(data->buf);
data->buf = vzalloc(size);
if (!data->buf) {
err = -ENOMEM;
goto out_free;
}
data->size = size;

flags = prf_lock();
err = prf_serialize(data);
prf_unlock(flags);
} while (err == -EAGAIN);

if (err)
goto out_free;

file->private_data = data;
return 0;

out_free:
if (data)
vfree(data->buf);
kfree(data);
return err;


>
> + data->size = buf_size;
> file->private_data = data;
>
> out_unlock:
>
> base-commit: e1af496cbe9b4517428601a4e44fee3602dd3c15
> prerequisite-patch-id: fccc1bd89bbd33af13a4ce9bc3c913e6e3cdecee
> prerequisite-patch-id: a2e53c0b44ad39c78ed7bc7aad40d133548a13b5
> prerequisite-patch-id: 12f0e468a3d0ff12c7f5bc640f213be3b5dd261b
> prerequisite-patch-id: 707b836b1969958b5131dfa1b9f044eae5f4a76a
> --
> 2.31.1
>

--
Kees Cook