audit: io_uring openat triggers audit reference count underflow in worker thread
From: Dan Clash
Date: Fri Oct 06 2023 - 16:09:42 EST
This discussion is about how to fix an audit reference count decrement
race between two io_uring threads.
Original discussion link:
https : / / github . com / axboe / liburing / issues / 958
Details:
The test program below hangs indefinitely waiting for an openat cqe. The
reproduction is with a distro kernel Ubuntu-azure-6.2-6.2.0-1012.12_22.04.1.
However, the bug seems possible with an upstream kernel. An experiment of
changing the reference count in struct filename from int to refcount_t allows
the test program to complete.
The bug did not occur with this test program until a kernel containing
commit 5bd2182d58e9 was used. I have not found a matching reported issue
or upstream commit yet.
The dmseg log shows an audit related path:
[27883.992550] kernel BUG at fs/namei.c:262!
[27883.994051] invalid opcode: 0000 [#15] SMP PTI
[27883.995719] CPU: 3 PID: 84988 Comm: iou-wrk-84835 Tainted: G D
6.2.0-1012-azure #12~22.04.1-Ubuntu
[27883.999064] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine,
BIOS Hyper-V UEFI Release v4.1 05/09/2022
[27884.002734] RIP: 0010:putname+0x68/0x70
...
[27884.032893] Call Trace:
[27884.034032] <TASK>
[27884.035117] ? show_regs+0x6a/0x80
[27884.036763] ? die+0x38/0xa0
[27884.038023] ? do_trap+0xd0/0xf0
[27884.039359] ? do_error_trap+0x70/0x90
[27884.040861] ? putname+0x68/0x70
[27884.042201] ? exc_invalid_op+0x53/0x70
[27884.043698] ? putname+0x68/0x70
[27884.045076] ? asm_exc_invalid_op+0x1b/0x20
[27884.047051] ? putname+0x68/0x70
[27884.048415] audit_reset_context.part.0.constprop.0+0xe1/0x300
[27884.050511] __audit_uring_exit+0xda/0x1c0
[27884.052100] io_issue_sqe+0x1f3/0x450
[27884.053702] ? lock_timer_base+0x3b/0xd0
[27884.055283] io_wq_submit_work+0x8d/0x2b0
[27884.056848] ? __try_to_del_timer_sync+0x67/0xa0
[27884.058577] io_worker_handle_work+0x17c/0x2b0
[27884.060267] io_wqe_worker+0x10a/0x350
[27884.061714] ? raw_spin_rq_unlock+0x10/0x30
[27884.063295] ? finish_task_switch.isra.0+0x8b/0x2c0
[27884.065537] ? __pfx_io_wqe_worker+0x10/0x10
[27884.067215] ret_from_fork+0x2c/0x50
[27884.068733] RIP: 0033:0x0
...
Test program usage:
./io_uring_open_close_audit_hang --directory /tmp/deleteme --count 10000
Test program source:
// Note: The test program is C++ but could be converted to C.
#include <cassert>
#include <fcntl.h>
#include <filesystem>
#include <getopt.h>
#include <iostream>
#include <liburing.h>
// open and close a file. the file is created if it does not exist.
void
openClose(struct io_uring& ring, std::string fileName)
{
int ret;
struct io_uring_cqe* cqe {};
struct io_uring_sqe* sqe {};
int fd {};
int flags {O_RDWR | O_CREAT};
mode_t mode {0666};
// openat2
sqe = io_uring_get_sqe(&ring);
assert(sqe != nullptr);
io_uring_prep_openat(sqe, AT_FDCWD, fileName.data(), flags, mode);
io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
ret = io_uring_submit(&ring);
assert(ret == 1);
ret = io_uring_wait_cqe(&ring, &cqe);
assert(ret == 0);
fd = cqe->res;
assert(fd > 0);
io_uring_cqe_seen(&ring, cqe);
// close
sqe = io_uring_get_sqe(&ring);
assert(sqe != nullptr);
io_uring_prep_close(sqe, fd);
io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
ret = io_uring_submit(&ring);
assert(ret == 1);
// wait for the close to complete.
ret = io_uring_wait_cqe(&ring, &cqe);
assert(ret == 0);
// verify that close succeeded.
assert(cqe->res == 0);
io_uring_cqe_seen(&ring, cqe);
}
// create 100 files and then open each file twice.
void
openCloseHang(std::string filePath)
{
int ret;
struct io_uring ring;
ret = io_uring_queue_init(8, &ring, 0);
assert(0 == ret);
int repeat {3};
int numFiles {100};
std::filesystem::create_directory(filePath);
// files of length 0 are created in the j==0 iteration below.
// those files are opened and closed during the j>0 iteraions.
// a repeat of 3 results in a fairly reliable reproduction.
for (int j = 0; j < repeat; j += 1) {
for (int i = 0; i < numFiles; i += 1) {
std::string fileName(filePath + "/file" + std::to_string(i));
openClose(ring, fileName);
}
}
std::filesystem::remove_all(filePath);
io_uring_queue_exit(&ring);
}
int
main(int argc, char** argv)
{
std::string filePath {};
int iterations {};
struct option options[]
{
{"help", no_argument, 0, 'h'}, {"directory", required_argument, 0, 'd'},
{"count", required_argument, 0, 'c'},
{
0, 0, 0, 0
}
};
bool printUsage {false};
int val {};
while ((val = getopt_long_only(argc, argv, "", options, nullptr)) != -1) {
if (val == 'h') {
printUsage = true;
} else if (val == 'd') {
filePath = optarg;
if (std::filesystem::exists(filePath)) {
printUsage = true;
std::cerr << "directory must not exist" << std::endl;
}
} else if (val == 'c') {
iterations = atoi(optarg);
if (0 == iterations) {
printUsage = true;
}
} else {
printUsage = true;
}
}
if ((0 == iterations) || (filePath.empty())) {
printUsage = true;
}
if (printUsage || (optind < argc)) {
std::cerr << "io_uring_open_close_audit_hang.cc --directory DIR --count COUNT" << std::endl;
exit(1);
}
for (int i = 0; i < iterations; i += 1) {
if (0 == (i % 100)) {
std::cout << "i=" << std::to_string(i) << std::endl;
}
openCloseHang(filePath);
}
return 0;
}
Changing the reference count from int to refcount_t allows the test program
to complete using the v6.2 distro kernel. The patch applies and builds on the
upstream v6.1.55 kernel.
Signed-off-by: Dan Clash <dan.clash@xxxxxxxxxxxxx>
---
diff --git a/fs/namei.c b/fs/namei.c
index 2a8baa6ce3e8..4f7ac131c9d1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -187,7 +187,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
}
}
- result->refcnt = 1;
+ refcount_set(&result->refcnt, 1);
/* The empty path is special. */
if (unlikely(!len)) {
if (empty)
@@ -248,7 +248,7 @@ getname_kernel(const char * filename)
memcpy((char *)result->name, filename, len);
result->uptr = NULL;
result->aname = NULL;
- result->refcnt = 1;
+ refcount_set(&result->refcnt, 1);
audit_getname(result);
return result;
@@ -259,9 +259,10 @@ void putname(struct filename *name)
if (IS_ERR(name))
return;
- BUG_ON(name->refcnt <= 0);
+ BUG_ON(refcount_read(&name->refcnt) == 0);
+ BUG_ON(refcount_read(&name->refcnt) == REFCOUNT_SATURATED);
- if (--name->refcnt > 0)
+ if (!refcount_dec_and_test(&name->refcnt))
return;
if (name->name != name->iname) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d0a54e9aac7a..8217e07726d4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2719,7 +2719,7 @@ struct audit_names;
struct filename {
const char *name; /* pointer to actual string */
const __user char *uptr; /* original userland pointer */
- int refcnt;
+ refcount_t refcnt;
struct audit_names *aname;
const char iname[];
};
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 37cded22497e..232e0be9f6d9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2188,7 +2188,7 @@ __audit_reusename(const __user char *uptr)
if (!n->name)
continue;
if (n->name->uptr == uptr) {
- n->name->refcnt++;
+ refcount_inc(&n->name->refcnt);
return n->name;
}
}
@@ -2217,7 +2217,7 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- name->refcnt++;
+ refcount_inc(&name->refcnt);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2349,7 +2349,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
return;
if (name) {
n->name = name;
- name->refcnt++;
+ refcount_inc(&name->refcnt);
}
out:
@@ -2474,7 +2474,7 @@ void __audit_inode_child(struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- found_child->name->refcnt++;
+ refcount_inc(&found_child->name->refcnt);
}
}