[PATCH] tracing: (backport) Replace kmap with copy_from_user() in trace_marker
From: Henrik Austad
Date: Fri Dec 09 2016 - 01:34:53 EST
Instead of using get_user_pages_fast() and kmap_atomic() when writing
to the trace_marker file, just allocate enough space on the ring buffer
directly, and write into it via copy_from_user().
Writing into the trace_marker file use to allocate a temporary buffer
to perform the copy_from_user(), as we didn't want to write into the
ring buffer if the copy failed. But as a trace_marker write is suppose
to be extremely fast, and allocating memory causes other tracepoints to
trigger, Peter Zijlstra suggested using get_user_pages_fast() and
kmap_atomic() to keep the user space pages in memory and reading it
directly.
Instead, just allocate the space in the ring buffer and use
copy_from_user() directly. If it faults, return -EFAULT and write
"<faulted>" into the ring buffer.
On architectures without a arch-specific get_user_pages_fast(), this
will end up in the generic get_user_pages_fast() and this grabs
mm->mmap_sem. Once you do this, then suddenly writing to the
trace_marker can cause priority-inversions.
This is a backport of Steven Rostedts patch [1] and applied to 3.10.x so the
signed-off-chain by is somewhat uncertain at this stage.
The patch compiles, boots and does not immediately explode on impact. By
definition [2] it must therefore be perfect
2) https://www.spinics.net/lists/kernel/msg2400769.html
2) http://lkml.iu.edu/hypermail/linux/kernel/9804.1/0149.html
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Henrik Austad <henrik@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: stable@xxxxxxxxxxxxxxx
Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Used-to-be-signed-off-by: Steven Rostedt <rostedt@xxxxxxxxxxx>
Backported-by: Henrik Austad <haustad@xxxxxxxxx>
Tested-by: Henrik Austad <haustad@xxxxxxxxx>
Signed-off-by: Henrik Austad <haustad@xxxxxxxxx>
---
kernel/trace/trace.c | 78 +++++++++++++++-------------------------------------
1 file changed, 22 insertions(+), 56 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 18cdf91..94eb1ee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4501,15 +4501,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
struct ring_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
- struct page *pages[2];
- void *map_page[2];
- int nr_pages = 1;
+ const char faulted[] = "<faulted>";
ssize_t written;
- int offset;
int size;
int len;
- int ret;
- int i;
+
+/* Used in tracing_mark_raw_write() as well */
+#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */
if (tracing_disabled)
return -EINVAL;
@@ -4520,60 +4518,34 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (cnt > TRACE_BUF_SIZE)
cnt = TRACE_BUF_SIZE;
- /*
- * Userspace is injecting traces into the kernel trace buffer.
- * We want to be as non intrusive as possible.
- * To do so, we do not want to allocate any special buffers
- * or take any locks, but instead write the userspace data
- * straight into the ring buffer.
- *
- * First we need to pin the userspace buffer into memory,
- * which, most likely it is, because it just referenced it.
- * But there's no guarantee that it is. By using get_user_pages_fast()
- * and kmap_atomic/kunmap_atomic() we can get access to the
- * pages directly. We then write the data directly into the
- * ring buffer.
- */
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- /* check if we cross pages */
- if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
- nr_pages = 2;
-
- offset = addr & (PAGE_SIZE - 1);
- addr &= PAGE_MASK;
-
- ret = get_user_pages_fast(addr, nr_pages, 0, pages);
- if (ret < nr_pages) {
- while (--ret >= 0)
- put_page(pages[ret]);
- written = -EFAULT;
- goto out;
- }
+ local_save_flags(irq_flags);
+ size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
- for (i = 0; i < nr_pages; i++)
- map_page[i] = kmap_atomic(pages[i]);
+ /* If less than "<faulted>", then make sure we can still add that */
+ if (cnt < FAULTED_SIZE)
+ size += FAULTED_SIZE - cnt;
- local_save_flags(irq_flags);
- size = sizeof(*entry) + cnt + 2; /* possible \n added */
buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
irq_flags, preempt_count());
- if (!event) {
- /* Ring buffer disabled, return as if not open for write */
- written = -EBADF;
- goto out_unlock;
- }
+
+ if (unlikely(!event))
+ /* Ring buffer disabled, return as if not open for write */
+ return -EBADF;
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
- if (nr_pages == 2) {
- len = PAGE_SIZE - offset;
- memcpy(&entry->buf, map_page[0] + offset, len);
- memcpy(&entry->buf[len], map_page[1], cnt - len);
+ len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
+ if (len) {
+ memcpy(&entry->buf, faulted, FAULTED_SIZE);
+ cnt = FAULTED_SIZE;
+ written = -EFAULT;
} else
- memcpy(&entry->buf, map_page[0] + offset, cnt);
+ written = cnt;
+ len = cnt;
if (entry->buf[cnt - 1] != '\n') {
entry->buf[cnt] = '\n';
@@ -4583,16 +4555,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
__buffer_unlock_commit(buffer, event);
- written = cnt;
- *fpos += written;
+ if (written > 0)
+ *fpos += written;
- out_unlock:
- for (i = 0; i < nr_pages; i++){
- kunmap_atomic(map_page[i]);
- put_page(pages[i]);
- }
- out:
return written;
}
--
2.7.4