Re: [PATCH] x86/traps: Don't force in_interrupt() to return true in IST handlers

From: Andy Lutomirski
Date: Tue May 24 2016 - 14:55:18 EST


On Tue, May 24, 2016 at 10:09 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Tue, May 24, 2016 at 08:43:57AM -0700, Andy Lutomirski wrote:
>> > So this has implications for code like
>> > kernel/events/internal.h:get_recursion_context() and
>> > kernel/trace/trace.c:get_trace_buf().
>> >
>> > Which use a sequence of: in_nmi(), in_irq(), in_softirq() to pick 1 out
>> > of 4 possible contexts.
>> >
>> > I would really like the Changelog to reflect on this. The current code
>> > will have ISTs land in in_irq(); with this change, not so much.
>>
>> I can change the changelog.
>
> This is basically all I'm asking.
>
>> > Now ISTs as a whole invalidate the whole 'we have only 4 contexts' and
>> > the mapping back to those 4 is going to be somewhat arbitrary I suspect,
>> > but changes like this should be very much aware of these things. And
>> > make an explicit choice.
>>
>> I'm not so comfortable with trying to make any particular guarantees
>> about what all the in_xyz() things will return for different entry
>> types and how they nest. For example, debug can nest inside itself
>> quite easily (at one point I even had a user program to force it to
>> happen) -- this can trigger when a watchpoint nests inside a
>> single-step trap, and it can also happen when a watchpoint handler is
>> interrupted by an NMI that then recursively triggers the watchpoint.
>> The latter could easily result in nested NMIs that are directly
>> visible to the trace or event code.
>>
>> On x86, there's also MCE, which is NMI-ish, and NMI and MCE can freely
>> nest inside each other. (Blech.)
>
> Right; all the nesting possibilities are endless and insane. And I don't
> think it makes sense to try and enumerate them all and, worse, expose
> this to generic code.
>
> I think having the 4: task, softirq, hardirq, nmi is fine. We just need
> to be a little careful with how we map to them, so that we don't wreck some
> common situation and suddenly start losing trace data.
>
>> Would it make more sense to adjust the trace code to have a percpu
>> nesting count and to match up get_trace_buf with put_trace_buf to
>> decrement the count? The event code looks like the same thing could
>> happen.
>
> We have, but it's coupled with static storage, so as to avoid having to
> do it on the stack or *horror* dynamically allocate.
>
> So in order to protect these resources we have the per-context nesting
> count.
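
For reference, the 1-of-4 selection Peter describes above (the pattern used by
get_recursion_context() and by the current get_trace_buf(), which the patch
below rewrites) looks roughly like this. This is an illustrative sketch, not
the actual kernel code, and current_context() is a made-up name:

	#include <linux/hardirq.h>	/* in_nmi(), in_irq(), in_softirq() */

	static int current_context(void)
	{
		if (in_nmi())
			return 3;	/* NMI context */
		else if (in_irq())
			return 2;	/* hard interrupt context */
		else if (in_softirq())
			return 1;	/* softirq context */
		else
			return 0;	/* normal task context */
	}

The returned index picks one of four per-cpu slots, which is what makes the
recording lockless; the open question above is which slot an IST entry should
map to once it no longer counts as in_irq().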


I gave it a shot and it looks straightforward and is a net deletion of
code. It's attached and compile-tested only. Is there a reason you
don't like this approach?
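
In outline, the approach is just an explicit per-cpu nesting count in front of
a small array of buffers. A minimal sketch of the idea (made-up names,
simplified, not the patch itself):

	#define MAX_NESTING	4
	#define BUF_SIZE	1024

	/* one of these per cpu; nesting counts the live users on this cpu */
	struct buf_pool {
		int	nesting;
		char	buffer[MAX_NESTING][BUF_SIZE];
	};

	/* claim the next slot, or fail if we're nested too deeply */
	static char *get_buf(struct buf_pool *pool)
	{
		if (pool->nesting >= MAX_NESTING)
			return NULL;
		return pool->buffer[pool->nesting++];
	}

	/* release the most recently claimed slot; pairs with a successful get_buf() */
	static void put_buf(struct buf_pool *pool)
	{
		pool->nesting--;
	}

Unlike the in_nmi()/in_irq()/in_softirq() classification, this copes with any
entry that can nest (IST exceptions, MCE, nested NMIs) up to the fixed depth;
the cost is that every caller that obtained a buffer has to release it on
every exit path, which is why the patch adds the out/out_nobuffer labels.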

--Andy
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4b49c99f22..acd366e4ab1a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1986,83 +1986,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)

/* created for use with alloc_percpu */
struct trace_buffer_struct {
- char buffer[TRACE_BUF_SIZE];
+ int nesting;
+ char buffer[4][TRACE_BUF_SIZE];
};

static struct trace_buffer_struct *trace_percpu_buffer;
-static struct trace_buffer_struct *trace_percpu_sirq_buffer;
-static struct trace_buffer_struct *trace_percpu_irq_buffer;
-static struct trace_buffer_struct *trace_percpu_nmi_buffer;

/*
- * The buffer used is dependent on the context. There is a per cpu
- * buffer for normal context, softirq contex, hard irq context and
- * for NMI context. Thise allows for lockless recording.
- *
- * Note, if the buffers failed to be allocated, then this returns NULL
+ * This allows for lockless recording. If we're nested too deeply, then
+ * this returns NULL.
*/
static char *get_trace_buf(void)
{
- struct trace_buffer_struct *percpu_buffer;
-
- /*
- * If we have allocated per cpu buffers, then we do not
- * need to do any locking.
- */
- if (in_nmi())
- percpu_buffer = trace_percpu_nmi_buffer;
- else if (in_irq())
- percpu_buffer = trace_percpu_irq_buffer;
- else if (in_softirq())
- percpu_buffer = trace_percpu_sirq_buffer;
- else
- percpu_buffer = trace_percpu_buffer;
+ struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);

- if (!percpu_buffer)
+ if (!buffer || buffer->nesting >= 4)
return NULL;

- return this_cpu_ptr(&percpu_buffer->buffer[0]);
+ return &buffer->buffer[buffer->nesting++][0];
+}
+
+static void put_trace_buf(void)
+{
+ this_cpu_dec(trace_percpu_buffer->nesting);
}

static int alloc_percpu_trace_buffer(void)
{
struct trace_buffer_struct *buffers;
- struct trace_buffer_struct *sirq_buffers;
- struct trace_buffer_struct *irq_buffers;
- struct trace_buffer_struct *nmi_buffers;

buffers = alloc_percpu(struct trace_buffer_struct);
- if (!buffers)
- goto err_warn;
-
- sirq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!sirq_buffers)
- goto err_sirq;
-
- irq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!irq_buffers)
- goto err_irq;
-
- nmi_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!nmi_buffers)
- goto err_nmi;
+ if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+ return -ENOMEM;

trace_percpu_buffer = buffers;
- trace_percpu_sirq_buffer = sirq_buffers;
- trace_percpu_irq_buffer = irq_buffers;
- trace_percpu_nmi_buffer = nmi_buffers;
-
return 0;
-
- err_nmi:
- free_percpu(irq_buffers);
- err_irq:
- free_percpu(sirq_buffers);
- err_sirq:
- free_percpu(buffers);
- err_warn:
- WARN(1, "Could not allocate percpu trace_printk buffer");
- return -ENOMEM;
}

static int buffers_allocated;
@@ -2153,7 +2111,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}

len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2179,6 +2137,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
}

out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();

@@ -2210,7 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}

len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2229,7 +2190,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
}
- out:
+
+out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();