Re: [patch 3/3] x86/fpu: Make FPU protection more robust

From: Jason A. Donenfeld
Date: Thu May 05 2022 - 07:02:21 EST


Hey again,

On Thu, May 05, 2022 at 03:21:43AM +0200, Thomas Gleixner wrote:
> > Thanks, I'll give it a shot in the morning (3am) when trying to do a
> > more realistic benchmark. But just as a synthetic thing, I ran the
> > numbers in kbench9000 and am getting:
> >
> > generic: 430 cycles per call
> > ssse3: 315 cycles per call
> > avx512: 277 cycles per call
> >
> > for a single call to the compression function, which is the most any of
> > those mix_pool_bytes() calls do from add_{input,disk}_randomness(), on
> > Tiger Lake, using RDPMC from kernel space.
>
> I'm well aware of the difference between synthetic benchmarks and real
> world scenarios and with the more in depth instrumentation of these
> things I'm even more concerned that the difference is underestimated.
>
> > This _doesn't_ take into account the price of calling kernel_fpu_begin().
> > That's a little hard to bench synthetically by running it in a loop and
> > taking medians because of the lazy restoration. But that's an indication
> > anyway that I should be looking at the cost of the actual function as
> > its running in random.c, rather than the synthetic test. Will keep this
> > thread updated.
>
> Appreciated.

Interestingly, disabling the simd paths makes things around 283 cycles
slower on my Tiger Lake laptop, just doing ordinary things. I'm actually
slightly surprised, so I'll probably keep playing with this. My patch
for this is attached. Let me know if you have a different methodology in
mind...

Jason

diff --git a/drivers/char/random.c b/drivers/char/random.c
index bd292927654c..7a70a186c26b 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -53,6 +53,7 @@
 #include <linux/uuid.h>
 #include <linux/uaccess.h>
 #include <linux/suspend.h>
+#include <linux/sort.h>
 #include <crypto/chacha.h>
 #include <crypto/blake2s.h>
 #include <asm/processor.h>
@@ -755,9 +756,48 @@ static struct {
 	.lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
 };
 
+static struct {
+	u32 durations[1 << 20];
+	u32 pos, len;
+} irqbench;
+
 static void _mix_pool_bytes(const void *in, size_t nbytes)
 {
+	u32 ctr = input_pool.hash.t[0];
+	cycles_t end, start = get_cycles();
 	blake2s_update(&input_pool.hash, in, nbytes);
+	end = get_cycles();
+
+	if (ctr == input_pool.hash.t[0] || !in_hardirq())
+		return;
+
+	irqbench.durations[irqbench.pos++ % ARRAY_SIZE(irqbench.durations)] = end - start;
+	irqbench.len = min_t(u32, irqbench.len + 1, ARRAY_SIZE(irqbench.durations));
+}
+
+static int cmp_u32(const void *a, const void *b)
+{
+	return *(const u32 *)a - *(const u32 *)b;
+}
+
+static int proc_do_irqbench_median(struct ctl_table *table, int write, void *buffer,
+				   size_t *lenp, loff_t *ppos)
+{
+	u32 len = READ_ONCE(irqbench.len), median, *sorted;
+	struct ctl_table fake_table = {
+		.data = &median,
+		.maxlen = sizeof(median)
+	};
+	if (!len)
+		return -ENODATA;
+	sorted = kmalloc_array(len, sizeof(*sorted), GFP_KERNEL);
+	if (!sorted)
+		return -ENOMEM;
+	memcpy(sorted, irqbench.durations, len * sizeof(*sorted));
+	sort(sorted, len, sizeof(*sorted), cmp_u32, NULL);
+	median = sorted[len / 2];
+	kfree(sorted);
+	return write ? 0 : proc_douintvec(&fake_table, 0, buffer, lenp, ppos);
 }
 
 /*
@@ -1709,6 +1749,18 @@ static struct ctl_table random_table[] = {
 		.mode = 0444,
 		.proc_handler = proc_do_uuid,
 	},
+	{
+		.procname = "irqbench_median",
+		.mode = 0444,
+		.proc_handler = proc_do_irqbench_median,
+	},
+	{
+		.procname = "irqbench_count",
+		.data = &irqbench.len,
+		.maxlen = sizeof(irqbench.len),
+		.mode = 0444,
+		.proc_handler = proc_douintvec,
+	},
 	{ }
 };
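
For reference, since random_table is registered under kernel/random, the two new entries should show up as /proc/sys/kernel/random/irqbench_median and /proc/sys/kernel/random/irqbench_count, so a throwaway userspace helper along these lines (purely illustrative, not part of the patch) is enough to poll the numbers:

/* Throwaway reader for the irqbench sysctls added above; assumes the
 * random sysctl table is exposed under /proc/sys/kernel/random/. */
#include <stdio.h>

static unsigned int read_u32(const char *path)
{
	unsigned int val = 0;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%u", &val) != 1)
			val = 0;
		fclose(f);
	}
	return val;
}

int main(void)
{
	unsigned int median = read_u32("/proc/sys/kernel/random/irqbench_median");
	unsigned int count = read_u32("/proc/sys/kernel/random/irqbench_count");

	printf("hardirq mix_pool_bytes(): median %u cycles over %u samples\n",
	       median, count);
	return 0;
}

A plain cat of the two proc files does the job just as well.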