WARNING: CPU: 1 PID: 52 at mm/page_alloc.c:4826 __alloc_pages_nodemask (Re: [PATCH 5/5] sysctl: pass kernel pointers to ->proc_handler)

From: Vegard Nossum
Date: Thu Jun 04 2020 - 16:22:57 EST



(Trimmed original Ccs due to outgoing email policy.)

Hi,

On 2020-04-24 08:43, Christoph Hellwig wrote:
Instead of having all the sysctl handlers deal with user pointers, which
is rather hairy in terms of the BPF interaction, copy the input to and
from userspace in common code. This also means that the strings are
always NUL-terminated by the common code, making the API a little bit
safer.

As most handler just pass through the data to one of the common handlers
a lot of the changes are mechnical.

Signed-off-by: Christoph Hellwig <hch@xxxxxx>
Acked-by: Andrey Ignatov <rdna@xxxxxx>

[snip]
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b6f5d459b087d..df2143e05c571 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -539,13 +539,13 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
return err;
}
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
+static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
size_t count, loff_t *ppos, int write)
{
struct inode *inode = file_inode(filp);
struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
- void *new_buf = NULL;
+ void *kbuf;
ssize_t error;
if (IS_ERR(head))
@@ -564,27 +564,38 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
if (!table->proc_handler)
goto out;
- error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count,
- ppos, &new_buf);
+ if (write) {
+ kbuf = memdup_user_nul(ubuf, count);
+ if (IS_ERR(kbuf)) {
+ error = PTR_ERR(kbuf);
+ goto out;
+ }
+ } else {
+ error = -ENOMEM;
+ kbuf = kzalloc(count, GFP_KERNEL);
+ if (!kbuf)
+ goto out;
+ }
+
+ error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
+ ppos);
if (error)
- goto out;
+ goto out_free_buf;
/* careful: calling conventions are nasty here */
- if (new_buf) {
- mm_segment_t old_fs;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- error = table->proc_handler(table, write, (void __user *)new_buf,
- &count, ppos);
- set_fs(old_fs);
- kfree(new_buf);
- } else {
- error = table->proc_handler(table, write, buf, &count, ppos);
+ error = table->proc_handler(table, write, kbuf, &count, ppos);
+ if (error)
+ goto out_free_buf;
+
+ if (!write) {
+ error = -EFAULT;
+ if (copy_to_user(ubuf, kbuf, count))
+ goto out_free_buf;
}
- if (!error)
- error = count;
+ error = count;
+out_free_buf:
+ kfree(kbuf);
out:
sysctl_head_finish(head);

This commit in recent linus/master
(32927393dc1ccd60fb2bdc05b9e8e88753761469) causes a regression for me:

------------[ cut here ]------------
WARNING: CPU: 1 PID: 52 at mm/page_alloc.c:4826 __alloc_pages_nodemask+0x1cd/0x2a0
CPU: 1 PID: 52 Comm: init Not tainted 5.7.0+ #218
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
RIP: 0010:__alloc_pages_nodemask+0x1cd/0x2a0
Code: 0f 85 26 ff ff ff 65 48 8b 04 25 00 7d 01 00 48 05 88 07 00 00 41 bd 01 00 00 00 48 89 44 24 08 e9 07 ff ff ff 80 e7 20 75 02 <0f> 0b 45 31 ed eb 98 44 8b 64 24 18 65 8b 05 d0 25 e9 7e 89 c0 48
RSP: 0018:ffffc900000e7de0 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 00000000000400c0 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000013 RDI: 0000000000040dc0
RBP: 000000007ffff000 R08: ffffffff820276c0 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffffc900000e7f08
R13: 0000000000000013 R14: 0000000000000013 R15: ffffffff81c34ce0
FS: 00000000006cf880(0000) GS:ffff88803ed00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004a1dab CR3: 000000003e012002 CR4: 00000000003606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
kmalloc_order+0x16/0x70
kmalloc_order_trace+0x18/0xa0
proc_sys_call_handler+0xf7/0x170
vfs_read+0x98/0x120
ksys_read+0x5a/0xd0
do_syscall_64+0x43/0x140
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x43f910
Code: 01 f0 ff ff 0f 83 e0 57 00 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 19 f2 28 00 00 75 14 b8 00 00 00 00 0f 05 <48> 3d 01 f0 ff ff 0f 83 b4 57 00 00 c3 48 83 ec 08 e8 4a 39 00 00
RSP: 002b:00007fffffffeaa8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043f910
RDX: 0000008000000000 RSI: 0000000000000000 RDI: 0000000000000003
RBP: 00007fffffffed10 R08: 0000000000000000 R09: 00000000006cf880
R10: 00000000006cfb50 R11: 0000000000000246 R12: 0000000000401870
R13: 0000000000401900 R14: 0000000000000000 R15: 0000000000000000
---[ end trace 20146069c1ec4970 ]---

It's easy to reproduce by just doing

read(open("/proc/sys/vm/swappiness", O_RDONLY), 0, 512UL * 1024 * 1024 * 1024);

or so. Reverting the commit fixes the issue for me.


Vegard