Re: [PATCH v1 1/1] rseq: Validate read-only fields under DEBUG_RSEQ config

From: Mathieu Desnoyers
Date: Tue Nov 12 2024 - 09:31:55 EST


On 2024-11-12 05:04, Peter Zijlstra wrote:
On Mon, Nov 11, 2024 at 02:22:14PM -0500, Mathieu Desnoyers wrote:

So I'm entirely agreeing with the intent, but perhaps we can argue a
little on the implementation :-)

Of course, thanks for providing feedback!


+#ifdef CONFIG_DEBUG_RSEQ
+static struct rseq *rseq_kernel_fields(struct task_struct *t)
+{
+ return (struct rseq *) t->rseq_fields;
+}
+
+static int rseq_validate_ro_fields(struct task_struct *t)
+{
+ u32 cpu_id_start, cpu_id, node_id, mm_cid;
+ struct rseq __user *rseq = t->rseq;
+
+ /*
+ * Validate fields which are required to be read-only by
+ * user-space.
+ */
+ if (!user_read_access_begin(rseq, t->rseq_len))
+ goto efault;
+ unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
+ unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_get_user(node_id, &rseq->node_id, efault_end);
+ unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
+ user_read_access_end();
+
+ if (cpu_id_start != rseq_kernel_fields(t)->cpu_id_start)
+ printk_ratelimited(KERN_WARNING
+ "Detected rseq cpu_id_start field corruption. Value: %u, expecting: %u (pid=%d).\n",
+ cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, t->pid);
+ if (cpu_id != rseq_kernel_fields(t)->cpu_id)
+ printk_ratelimited(KERN_WARNING
+ "Detected rseq cpu_id field corruption. Value: %u, expecting: %u (pid=%d).\n",
+ cpu_id, rseq_kernel_fields(t)->cpu_id, t->pid);
+ if (node_id != rseq_kernel_fields(t)->node_id)
+ printk_ratelimited(KERN_WARNING
+ "Detected rseq node_id field corruption. Value: %u, expecting: %u (pid=%d).\n",
+ node_id, rseq_kernel_fields(t)->node_id, t->pid);
+ if (mm_cid != rseq_kernel_fields(t)->mm_cid)
+ printk_ratelimited(KERN_WARNING
+ "Detected rseq mm_cid field corruption. Value: %u, expecting: %u (pid=%d).\n",
+ mm_cid, rseq_kernel_fields(t)->mm_cid, t->pid);

So aside from this just being ugly, this also has the problem of getting
the ratelimits out of sync and perhaps only showing partial corruption
for any one task.

Completely untested hackery below.

Your approach looks indeed better than mine, I'll steal it with your
permission. :)


@@ -423,6 +504,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
current->rseq = rseq;
current->rseq_len = rseq_len;
current->rseq_sig = sig;
+#ifdef CONFIG_DEBUG_RSEQ
+ /*
+ * Initialize the in-kernel rseq fields copy for validation of
+ * read-only fields.
+ */
+ if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
+ get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
+ get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
+ get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
+ return -EFAULT;
+#endif

So I didn't change the above, but wouldn't it make more sense to do
rseq_reset_rseq_cpu_node_id() here, but without the validation?

Indeed we could do this (for both DEBUG_RSEQ={y,n}), but it would add extra
useless stores to those userspace fields on rseq registration, which is
performed on every thread creation starting from glibc 2.35. The
rseq_set_notify_resume() invoked at the end of registration ensures that
those fields get populated before return to userspace.

So I am not against a more robust approach, but I'm reluctant to add redundant
work on thread creation.


---
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1364,6 +1364,15 @@ struct task_struct {
* with respect to preemption.
*/
unsigned long rseq_event_mask;
+# ifdef CONFIG_DEBUG_RSEQ
+ /*
+ * This is a place holder to save a copy of the rseq fields for
+ * validation of read-only fields. The struct rseq has a
+ * variable-length array at the end, so it cannot be used
+ * directly. Reserve a size large enough for the known fields.
+ */
+ char rseq_fields[sizeof(struct rseq)];
+# endif
#endif
#ifdef CONFIG_SCHED_MM_CID
--- a/kernel/rseq.c
+++ b/kernel/rseq.c

We should #include <linux/ratelimit.h> then.

@@ -25,6 +25,79 @@
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
+#ifdef CONFIG_DEBUG_RSEQ
+static struct rseq *rseq_kernel_fields(struct task_struct *t)
+{
+ return (struct rseq *) t->rseq_fields;
+}
+
+static int rseq_validate_ro_fields(struct task_struct *t)
+{
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ u32 cpu_id_start, cpu_id, node_id, mm_cid;
+ struct rseq __user *rseq = t->rseq;
+
+ /*
+ * Validate fields which are required to be read-only by
+ * user-space.
+ */
+ if (!user_read_access_begin(rseq, t->rseq_len))
+ goto efault;
+ unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
+ unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_get_user(node_id, &rseq->node_id, efault_end);
+ unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
+ user_read_access_end();
+
+ if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
+ cpu_id != rseq_kernel_fields(t)->cpu_id ||
+ node_id != rseq_kernel_fields(t)->node_id ||
+ mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
+
+ pr_warn("Detected rseq corruption for pid: %d;\n"
+ " cpu_id_start: %u ?= %u\n"
+ " cpu_id: %u ?= %u\n"
+ " node_id: %u ?= %u\n"
+ " mm_cid: %u ?= %u\n"
+ t->pid,
+ cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
+ cpu_id, rseq_kernel_fields(t)->cpu_id,
+ node_id, rseq_kernel_fields(t)->node_id,
+ mm_cid, rseq_kernel_fields(t)->mm_cid);
+ }

It looks better, thanks.

+
+ /* For now, only print a console warning on mismatch. */
+ return 0;
+
+efault_end:
+ user_read_access_end();
+efault:
+ return -EFAULT;
+}
+
+static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
+ u32 node_id, u32 mm_cid)
+{
+ rseq_kernel_fields(t)->cpu_id_start = cpu_id;
+ rseq_kernel_fields(t)->cpu_id = cpu_id;
+ rseq_kernel_fields(t)->node_id = node_id;
+ rseq_kernel_fields(t)->mm_cid = mm_cid;
+}

This too.

Thanks!

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com