[RFC PATCH v2 3/3] restartable sequences: basic self-tests

From: Paul Turner
Date: Tue Oct 27 2015 - 19:57:32 EST


From: pjt <pjt@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>

Implements two basic tests of RSEQ functionality.

The first, "basic_test" only asserts that RSEQ works moderately correctly.
E.g. that:
- The cpu id pointer works
- Code infinitely looping within a critical section will eventually be
interrupted.
- Critical sections are interrupted by signals.

"basic_percpu_ops_test" is a slightly more "realistic" variant, implementing a
few simple per-cpu operations and testing their correctness. It also includes
a trivial example of user-space may multiplexing the critical section via the
restart handler.
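
Both tests are built around the read/compute/commit pattern provided by the
rseq.h helpers added below. As a minimal sketch of that pattern (here
'example_count' is a hypothetical per-cpu array, not part of this patch):

	static intptr_t example_count[CPU_SETSIZE];
	struct rseq_state start;
	intptr_t newval;
	int cpu;

	rseq_init_current_thread();	/* once per participating thread */
	do {
		start = rseq_start();	/* snapshot (cpu, event_counter) */
		cpu = rseq_cpu_at_start(start);
		/* speculative compute; no stores to protected data here */
		newval = example_count[cpu] + 1;
	} while (!rseq_finish(&example_count[cpu], newval, start));

rseq_finish() fails, and the loop retries, whenever the thread was migrated,
preempted, or signaled between rseq_start() and the final commit store.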

Signed-off-by: Paul Turner <pjt@xxxxxxxxxx>
---
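Note: the tests are standalone; on a kernel with this series applied (and
__NR_rseq matching the number used in rseq.c below) they can be built and run
with, e.g.:

	make -C tools/testing/selftests/rseq
	./basic_test && ./basic_percpu_ops_test
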
tools/testing/selftests/rseq/Makefile | 13 +
.../testing/selftests/rseq/basic_percpu_ops_test.c | 268 ++++++++++++++++++++
tools/testing/selftests/rseq/basic_test.c | 87 ++++++
tools/testing/selftests/rseq/rseq.c | 36 +++
tools/testing/selftests/rseq/rseq.h | 109 ++++++++
5 files changed, 513 insertions(+)
create mode 100644 tools/testing/selftests/rseq/Makefile
create mode 100644 tools/testing/selftests/rseq/basic_percpu_ops_test.c
create mode 100644 tools/testing/selftests/rseq/basic_test.c
create mode 100644 tools/testing/selftests/rseq/rseq.c
create mode 100644 tools/testing/selftests/rseq/rseq.h

diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
new file mode 100644
index 0000000..40b9338
--- /dev/null
+++ b/tools/testing/selftests/rseq/Makefile
@@ -0,0 +1,13 @@
+CFLAGS += -Wall
+LDFLAGS += -lpthread
+
+TESTS = basic_test basic_percpu_ops_test
+
+all: $(TESTS)
+%: %.c
+	$(CC) $(CFLAGS) -o $@ $^ rseq.c $(LDFLAGS)
+
+include ../lib.mk
+
+clean:
+	$(RM) $(TESTS)
diff --git a/tools/testing/selftests/rseq/basic_percpu_ops_test.c b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
new file mode 100644
index 0000000..dcc57ad
--- /dev/null
+++ b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
@@ -0,0 +1,268 @@
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "rseq.h"
+
+struct percpu_lock {
+ intptr_t word[CPU_SETSIZE][64 / sizeof(intptr_t)]; /* one word per cpu, cache-line padded */
+};
+
+/* A simple percpu spinlock. Returns the cpu on which the lock was acquired. */
+int rseq_percpu_lock(struct percpu_lock *lock)
+{
+ struct rseq_state start;
+ int cpu;
+
+ do {
+ start = rseq_start();
+ cpu = rseq_cpu_at_start(start);
+ } while (lock->word[cpu][0] ||
+ !rseq_finish(&lock->word[cpu][0], 1, start));
+ return cpu;
+}
+
+void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
+{
+ barrier(); /* need a release-store here, this suffices on x86. */
+ assert(lock->word[cpu][0] == 1);
+ lock->word[cpu][0] = 0;
+}
+
+/*
+ * cmpxchg [with an additional check value].
+ *
+ * Returns:
+ * -1 if *p != old [ || *check_ptr != check_val, ] otherwise
+ * the cpu on which rseq_percpu_cmpxchg[check]() executed.
+ * - If this differs from the cpu passed in, no modification was made.
+ *
+ * Note: When specified, check_ptr is dereferenced iff *p == old.
+ */
+int rseq_percpu_cmpxchg(int cpu, intptr_t *p, intptr_t old, intptr_t new)
+{
+ struct rseq_state start;
+
+ while (1) {
+ start = rseq_start();
+ if (rseq_current_cpu() != cpu)
+ return rseq_current_cpu();
+ if (*p != old)
+ return -1;
+ if (rseq_finish(p, new, start))
+ return cpu;
+ }
+}
+
+int rseq_percpu_cmpxchgcheck(int cpu, intptr_t *p, intptr_t old, intptr_t new,
+ intptr_t *check_ptr, intptr_t check_val)
+{
+ struct rseq_state start;
+
+ while (1) {
+ start = rseq_start();
+ if (rseq_current_cpu() != cpu)
+ return rseq_current_cpu();
+ /*
+ * Note that we'd want the ultimate implementation of this to
+ * be open coded (similar to rseq_finish) so that we can
+ * guarantee *check is not dereferenced when old does not
+ * match. This could also be facilitated with a generic
+ * rseq_read_if_valid(...) helper.
+ */
+ if (*p != old || *check_ptr != check_val)
+ return -1;
+ if (rseq_finish(p, new, start))
+ return cpu;
+ }
+}
+
+void rseq_unknown_restart_addr(void *addr)
+{
+ fprintf(stderr, "rseq: unrecognized restart address %p\n", addr);
+ exit(1);
+}
+
+struct spinlock_test_data {
+ struct percpu_lock lock;
+ int counts[CPU_SETSIZE];
+ int reps;
+};
+
+void *test_percpu_spinlock_thread(void *arg)
+{
+ struct spinlock_test_data *data = arg;
+
+ int i, cpu;
+ rseq_init_current_thread();
+ for (i = 0; i < data->reps; i++) {
+ cpu = rseq_percpu_lock(&data->lock);
+ data->counts[cpu]++;
+ rseq_percpu_unlock(&data->lock, cpu);
+ }
+
+ return 0;
+}
+
+/*
+ * A simple test which implements a sharded counter using a per-cpu lock.
+ * Obviously real applications might prefer to simply use a per-cpu increment;
+ * however, this is reasonable for a test and the lock can be extended to
+ * synchronize more complicated operations.
+ */
+void test_percpu_spinlock()
+{
+ const int num_threads = 200;
+ int i, sum;
+ pthread_t test_threads[num_threads];
+ struct spinlock_test_data data;
+
+ memset(&data, 0, sizeof(data));
+ data.reps = 5000;
+
+ for (i = 0; i < num_threads; i++)
+ pthread_create(&test_threads[i], NULL,
+ test_percpu_spinlock_thread, &data);
+
+ for (i = 0; i < num_threads; i++)
+ pthread_join(test_threads[i], NULL);
+
+ sum = 0;
+ for (i = 0; i < CPU_SETSIZE; i++)
+ sum += data.counts[i];
+
+ assert(sum == data.reps * num_threads);
+}
+
+struct percpu_list_node {
+ intptr_t data;
+ struct percpu_list_node *next;
+};
+
+struct percpu_list {
+ struct percpu_list_node *heads[CPU_SETSIZE];
+};
+
+int percpu_list_push(struct percpu_list *list, struct percpu_list_node *node)
+{
+ int cpu;
+
+ do {
+ cpu = rseq_current_cpu();
+ node->next = list->heads[cpu];
+ } while (cpu != rseq_percpu_cmpxchg(cpu,
+ (intptr_t *)&list->heads[cpu], (intptr_t)node->next,
+ (intptr_t)node));
+
+ return cpu;
+}
+
+struct percpu_list_node *percpu_list_pop(struct percpu_list *list)
+{
+ int cpu;
+ struct percpu_list_node *head, *next;
+
+ do {
+ cpu = rseq_current_cpu();
+ head = list->heads[cpu];
+ /*
+ * Unlike a traditional lock-less linked list, the availability
+ * of a cmpxchg-check primitive allows us to implement pop
+ * without concerns over ABA-type races.
+ */
+ if (!head)
+ return NULL;
+ next = head->next;
+ } while (cpu != rseq_percpu_cmpxchgcheck(cpu,
+ (intptr_t *)&list->heads[cpu], (intptr_t)head, (intptr_t)next,
+ (intptr_t *)&head->next, (intptr_t)next));
+
+ return head;
+}
+
+
+void *test_percpu_list_thread(void *arg)
+{
+ int i;
+ struct percpu_list *list = (struct percpu_list *)arg;
+
+ rseq_init_current_thread();
+ for (i = 0; i < 100000; i++) {
+ struct percpu_list_node *node = percpu_list_pop(list);
+ sched_yield(); /* encourage shuffling */
+ if (node)
+ percpu_list_push(list, node);
+ }
+
+ return 0;
+}
+
+/* Simultaneous modification to a per-cpu linked list from many threads. */
+void test_percpu_list()
+{
+ int i, j;
+ long sum = 0, expected_sum = 0;
+ struct percpu_list list;
+ pthread_t test_threads[200];
+ cpu_set_t allowed_cpus;
+
+ memset(&list, 0, sizeof(list));
+
+ /* Generate list entries for every usable cpu. */
+ sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+ for (i = 0; i < CPU_SETSIZE; i++) {
+ if (!CPU_ISSET(i, &allowed_cpus)) continue;
+ for (j = 1; j <= 100; j++) {
+ struct percpu_list_node *node;
+
+ expected_sum += j;
+
+ node = malloc(sizeof(*node));
+ assert(node);
+ node->data = j;
+ node->next = list.heads[i];
+ list.heads[i] = node;
+ }
+ }
+
+ for (i = 0; i < 200; i++)
+ assert(pthread_create(&test_threads[i], NULL,
+ test_percpu_list_thread, &list) == 0);
+
+ for (i = 0; i < 200; i++)
+ pthread_join(test_threads[i], NULL);
+
+ for (i = 0; i < CPU_SETSIZE; i++) {
+ cpu_set_t pin_mask;
+ struct percpu_list_node *node;
+
+ if (!CPU_ISSET(i, &allowed_cpus)) continue;
+
+ CPU_ZERO(&pin_mask);
+ CPU_SET(i, &pin_mask);
+ sched_setaffinity(0, sizeof(pin_mask), &pin_mask);
+
+ while ((node = percpu_list_pop(&list))) {
+ sum += node->data;
+ free(node);
+ }
+ }
+
+ /*
+ * All entries should now be accounted for (unless some external actor
+ * is interfering with our allowed affinity while this test is
+ * running).
+ */
+ assert(sum == expected_sum);
+}
+
+int main(int argc, char **argv)
+{
+ rseq_init_current_thread();
+ printf("spinlock\n");
+ test_percpu_spinlock();
+ printf("percpu_list\n");
+ test_percpu_list();
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/rseq/basic_test.c b/tools/testing/selftests/rseq/basic_test.c
new file mode 100644
index 0000000..a3d3cdf
--- /dev/null
+++ b/tools/testing/selftests/rseq/basic_test.c
@@ -0,0 +1,87 @@
+/*
+ * Basic test coverage for critical regions and rseq_current_cpu().
+ */
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "rseq.h"
+
+void test_cpu_pointer()
+{
+ cpu_set_t affinity, test_affinity;
+ int i;
+
+ sched_getaffinity(0, sizeof(affinity), &affinity);
+ CPU_ZERO(&test_affinity);
+ for (i = 0; i < CPU_SETSIZE; i++) {
+ if (CPU_ISSET(i, &affinity)) {
+ CPU_SET(i, &test_affinity);
+ sched_setaffinity(0, sizeof(test_affinity),
+ &test_affinity);
+ assert(rseq_current_cpu() == sched_getcpu());
+ assert(rseq_current_cpu() == i);
+ CPU_CLR(i, &test_affinity);
+ }
+ }
+ sched_setaffinity(0, sizeof(affinity), &affinity);
+}
+
+void test_critical_section()
+{
+ /*
+ * This depends solely on some environmental event triggering a counter
+ * increase.
+ */
+ struct rseq_state start = rseq_start(), current;
+ do {
+ current = rseq_start();
+ } while (start.cpu == current.cpu &&
+ start.event_counter == current.event_counter);
+}
+
+volatile int signals_delivered;
+volatile struct rseq_state start;
+
+void test_signal_interrupt_handler(int signo)
+{
+ struct rseq_state current = rseq_start();
+ /*
+ * Any potential critical section bounded by 'start' must now be invalid.
+ */
+ assert(current.cpu != start.cpu ||
+ current.event_counter != start.event_counter);
+ signals_delivered++;
+}
+
+void test_signal_interrupts()
+{
+ struct itimerval it = {{0, 1}, {0, 1}};
+ setitimer(ITIMER_PROF, &it, NULL);
+ signal(SIGPROF, test_signal_interrupt_handler);
+
+ do {
+ start = rseq_start();
+ } while (signals_delivered < 10);
+ memset(&it, 0, sizeof(it)); /* a NULL new_value would return -EFAULT */
+ setitimer(ITIMER_PROF, &it, NULL);
+}
+
+int main(int argc, char **argv)
+{
+ rseq_init_current_thread();
+
+ printf("testing current cpu\n");
+ test_cpu_pointer();
+ printf("testing critical section\n");
+ test_critical_section();
+ printf("testing critical section is interrupted by signal\n");
+ test_signal_interrupts();
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
new file mode 100644
index 0000000..0ed5c8e
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -0,0 +1,36 @@
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "rseq.h"
+
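+/*
+ * Per-thread state shared with the kernel: cpu and event_counter are
+ * updated by the kernel across preemption and signal delivery; cpu is
+ * initialized to -1 so that registration can be verified below.
+ */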
+__thread volatile struct rseq_state __rseq_state = { .cpu = -1 };
+
+#define __NR_rseq 325
+
+int sys_rseq(int flags,
+ volatile uint64_t *event_and_cpu,
+ volatile void *post_commit_instr)
+{
+ return syscall(__NR_rseq, flags,
+ (intptr_t)event_and_cpu,
+ (intptr_t)post_commit_instr);
+}
+
+void rseq_init_current_thread(void)
+{
+ int rc = sys_rseq(0, &__rseq_state.storage,
+ &__rseq_state.post_commit_instr);
+ if (rc) {
+ fprintf(stderr,"Error: sys_rseq(...) failed(%d): %s\n",
+ errno, strerror(errno));
+ exit(1);
+ }
+ assert(rseq_current_cpu() != -1); /* always updated prior to return. */
+}
+
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
new file mode 100644
index 0000000..d16e02d
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -0,0 +1,109 @@
+#ifndef RSEQ_H
+#define RSEQ_H
+
+#include <stdint.h>
+
+struct rseq_state {
+ union {
+ /*
+ * Updated by the kernel. The interval between two rseq_state
+ * observations can be considered interruption-free if-and-only-if
+ * both cpu and event_counter match.
+ *
+ * Explicitly: the kernel is allowed to maintain a
+ * per-cpu event_counter.
+ */
+ struct {
+ int cpu;
+ int event_counter;
+ };
+ uint64_t storage;
+ };
+ void *post_commit_instr;
+};
+
+extern __thread volatile struct rseq_state __rseq_state;
+
+int sys_rseq(int flags,
+ volatile uint64_t *event_and_cpu,
+ volatile void *post_commit_instr);
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+
+static inline struct rseq_state rseq_start(void)
+{
+ struct rseq_state result = __rseq_state;
+ /*
+ * We need to ensure that the compiler does not re-order the loads of
+ * any protected values before we read the current state.
+ */
+ barrier();
+ return result;
+}
+
+static inline int rseq_cpu_at_start(struct rseq_state start_value)
+{
+ return start_value.cpu;
+}
+
+static inline int rseq_current_cpu(void)
+{
+ return __rseq_state.cpu;
+}
+
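+/*
+ * Attempt to commit 'to_write' to '*p' iff no interruption (migration,
+ * preemption, or signal) has occurred since 'start_value' was taken, i.e.
+ * the (cpu, event_counter) pair still matches __rseq_state. The sequence
+ * publishes its post-commit instruction address (label 1) up front and
+ * stages the abort address in rcx/ecx for the restart path. Returns 1 on
+ * success, 0 if the caller must retry from rseq_start().
+ */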
+static inline int rseq_finish(intptr_t *p, intptr_t to_write,
+ struct rseq_state start_value)
+{
+#ifdef __x86_64__
+ __asm__ __volatile__ goto (
+ "movq $%l[failed], %%rcx\n"
+ "movq $1f, %[commit_instr]\n"
+ "cmpq %[start_value], %[current_value]\n"
+ "jnz %l[failed]\n"
+ "movq %[to_write], (%[target])\n"
+ "1: movq $0, %[commit_instr]\n"
+ : /* no outputs */
+ : [start_value]"d"(start_value.storage),
+ [current_value]"m"(__rseq_state),
+ [to_write]"r"(to_write),
+ [target]"r"(p),
+ [commit_instr]"m"(__rseq_state.post_commit_instr)
+ : "rcx", "memory"
+ : failed
+ );
+#elif __i386__
+ __asm__ __volatile__ goto (
+ "movl $%l[failed], %%ecx\n"
+ "movl $1f, %[post_commit_instr]\n"
+ "cmp %[start_cpu], %[current_cpu]\n"
+ "jnz %l[failed]\n"
+ "cmp %[start_event], %[current_event]\n"
+ "jnz %l[failed]\n"
+ "movl %[to_write], (%[target])\n"
+ "1: movl $0, %[post_commit_instr]\n"
+ : /* no outputs */
+ : [start_cpu]"a"(start_value.cpu),
+ [start_event]"d"(start_value.event_counter),
+ [current_cpu]"g"(__rseq_state.cpu),
+ [current_event]"g"(__rseq_state.event_counter),
+ [post_commit_instr]"g"(__rseq_state.post_commit_instr),
+ [to_write]"r"(to_write),
+ [target]"r"(p)
+ : "ecx", "memory"
+ : failed
+ );
+#else
+#error unsupported target
+#endif
+ return 1;
+failed:
+ return 0;
+}
+
+/*
+ * Initialize rseq for the current thread. Must be called once by any thread
+ * which uses restartable sequences.
+ */
+void rseq_init_current_thread(void);
+
+#endif /* RSEQ_H */
