RE: [PATCH] x86/entry/64: randomize kernel stack offset upon syscall

From: Reshetova, Elena
Date: Tue Apr 30 2019 - 13:51:32 EST


>
> > On Apr 29, 2019, at 12:46 AM, Reshetova, Elena <elena.reshetova@xxxxxxxxx>
> wrote:
> >
> >
> >>>> On Apr 26, 2019, at 7:01 AM, Theodore Ts'o <tytso@xxxxxxx> wrote:
> >>>
> >
> >> It seems to me
> >> that we should be using the "fast-erasure" construction for all
> get_random_bytes()
> >> invocations. Specifically, we should have a per cpu buffer that stores some
> random
> >> bytes and a count of how many random bytes there are. get_random_bytes()
> should
> >> take bytes from that buffer and *immediately* zero those bytes in memory.
> When
> >> the buffer is empty, it gets refilled with the full strength CRNG.
> >
> > Ideally it would be great to call something fast and secure on each syscall without a per-
> cpu
> > buffer,
>
> Why? You only need a few bits, and any sensible crypto primitive is going to be
> much better at producing lots of bits than producing just a few bits. Even ignoring
> that, avoiding the I-cache hit on every syscall has value. And I still don't see what's
> wrong with a percpu buffer.

I guess this is true, so I did a quick implementation now to estimate the
performance hits.
Here are the preliminary numbers (proper ones will take a bit more time):

base: Simple syscall: 0.1761 microseconds
get_random_bytes (4096 bytes per-cpu buffer): 0.1793 microseconds
get_random_bytes (64 bytes per-cpu buffer): 0.1866 microseconds

It does not make sense to go below 64 bytes since this seems to be
the ChaCha20 block size, so if we go lower, we will throw away useful bits.
You can go even higher than 4096 bytes, but even this looks like
okish performance to me.

Below is a snippet of what I quickly did (relevant parts) to get these numbers.
I do the initial population of the per-cpu buffers in a late_initcall, but
practice shows that the rng might not always be in a good state by then.
So, we might not have really good randomness at that point, but I am not sure
if this is a practical problem since it only applies to system boot, and by
the time the system has booted, it has already issued enough syscalls that the
buffer has been refilled with really good numbers.
Alternatively we can also do it on the first syscall that each cpu gets, but I
am not sure if that is always guaranteed to have good randomness.

+#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET
+#include <linux/random.h>
+
+void *__builtin_alloca(size_t size);
+/*
+ * add_random_stack_offset() - shift the stack by a random number of bytes.
+ *
+ * Takes one byte from random_get_byte() (so the offset is 0..255) and
+ * reserves that many bytes on the current stack with __builtin_alloca().
+ * The empty asm names *ptr as a memory output; presumably this is what
+ * keeps the compiler from discarding the otherwise unused allocation --
+ * TODO confirm against the generated code.
+ *
+ * Expands to an empty statement when CONFIG_RANDOMIZE_KSTACK_OFFSET is
+ * not set.
+ */
+#define add_random_stack_offset() do { \
+ size_t offset = random_get_byte(); \
+ char *ptr = __builtin_alloca(offset); \
+ asm volatile("":"=m"(*ptr)); \
+} while (0)
+#else
+#define add_random_stack_offset() do {} while (0)
+#endif

...

diff --git a/include/linux/random.h b/include/linux/random.h
index 445a0ea4ff49..9fbce9d6ee70 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -115,6 +115,15 @@ struct rnd_state {
__u32 s1, s2, s3, s4;
};

+#define RANDOM_BUFFER_SIZE 64
+/*
+ * Pool of pre-generated random bytes, handed out one byte at a time by
+ * random_get_byte().
+ *
+ * @buffer:       RANDOM_BUFFER_SIZE bytes obtained from get_random_bytes()
+ * @byte_counter: index of the next byte in @buffer to hand out; the
+ *                buffer is refilled once it reaches RANDOM_BUFFER_SIZE
+ */
+struct rnd_buffer {
+ unsigned char buffer[RANDOM_BUFFER_SIZE];
+ __u64 byte_counter;
+};
+unsigned char random_get_byte(void);
+
+

....

diff --git a/lib/percpu-random.c b/lib/percpu-random.c
new file mode 100644
index 000000000000..3f92c44fbc1a
--- /dev/null
+++ b/lib/percpu-random.c
@@ -0,0 +1,49 @@
+#include <linux/types.h>
+#include <linux/percpu.h>
+#include <linux/random.h>
+
+static DEFINE_PER_CPU(struct rnd_buffer, stack_rand_offset) __latent_entropy;
+
+
+/*
+ * Fill the per-cpu stack_rand_offset buffer of every possible CPU with
+ * bytes from get_random_bytes() and reset its read position to zero.
+ *
+ * Registered as a late_initcall; always returns 0.
+ */
+static int __init stack_rand_offset_init(void)
+{
+ int i;
+
+ /* extract bytes into our per-cpu rand buffers */
+ for_each_possible_cpu(i) {
+ struct rnd_buffer *buffer = &per_cpu(stack_rand_offset, i);
+ buffer->byte_counter = 0;
+ /* if the rng is not initialized yet, this won't give us good
+ * randomness, but we cannot wait for the rng to initialize either */
+ get_random_bytes(&(buffer->buffer), sizeof(buffer->buffer));
+
+ }
+
+ return 0;
+}
+late_initcall(stack_rand_offset_init);
+
+/*
+ * random_get_byte() - hand out one byte from this CPU's random pool.
+ *
+ * The pool is refilled from get_random_bytes() once all of its
+ * RANDOM_BUFFER_SIZE bytes have been consumed. Every byte is zeroed in
+ * the pool right after it has been read, so a value that was handed out
+ * does not linger in memory.
+ *
+ * Return: the next unused random byte.
+ */
+unsigned char random_get_byte(void)
+{
+	struct rnd_buffer *pool = &get_cpu_var(stack_rand_offset);
+	unsigned char *slot;
+	unsigned char byte;
+
+	/* Pool exhausted: draw a fresh batch and start over at index 0. */
+	if (pool->byte_counter >= RANDOM_BUFFER_SIZE) {
+		get_random_bytes(&(pool->buffer), sizeof(pool->buffer));
+		pool->byte_counter = 0;
+	}
+
+	/* Consume the current slot and erase it immediately. */
+	slot = &pool->buffer[pool->byte_counter];
+	byte = *slot;
+	*slot = 0;
+	pool->byte_counter++;
+
+	put_cpu_var(stack_rand_offset);
+	return byte;
+}
+EXPORT_SYMBOL(random_get_byte);