[RFC PATCH v2 12/19] x86: Implement the Memory Table feature to store arbitrary per-page data

From: Mickaël Salaün
Date: Sun Nov 12 2023 - 21:25:50 EST


From: Madhavan T. Venkataraman <madvenka@xxxxxxxxxxxxxxxxxxx>

This feature can be used by a consumer to associate any arbitrary
pointer with a physical page. The feature implements a page table format
that mirrors the hardware page table. A leaf entry in the table points
to consumer data for that page.

The page table format has these advantages:

- The format allows for a sparse representation. This is useful since
the physical address space can be large and is typically sparsely
populated in a system.

- A consumer of this feature can choose to populate data just for the
pages he is interested in.

- Information can be stored for large pages, if a consumer wishes.

For instance, for Heki, the guest kernel uses this to create permissions
counters for each guest physical page. The permissions counters reflects
the collective permissions for a guest physical page across all mappings
to that page. This allows the guest to request the hypervisor to set
only the necessary permissions for a guest physical page in the EPT
(instead of RWX).

This feature could also be used to improve the KVM's memory attribute
and the write page tracking.

We will support large page entries in mem_table in a future version
thanks to extra mem_table_ops's merge() and split() operations.

Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: H. Peter Anvin <hpa@xxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Kees Cook <keescook@xxxxxxxxxxxx>
Cc: Madhavan T. Venkataraman <madvenka@xxxxxxxxxxxxxxxxxxx>
Cc: Mickaël Salaün <mic@xxxxxxxxxxx>
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: Sean Christopherson <seanjc@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
Cc: Wanpeng Li <wanpengli@xxxxxxxxxxx>
Signed-off-by: Madhavan T. Venkataraman <madvenka@xxxxxxxxxxxxxxxxxxx>
---

Changes since v1:
* New patch and new file: kernel/mem_table.c
---
arch/x86/kernel/setup.c | 2 +
include/linux/heki.h | 1 +
include/linux/mem_table.h | 55 ++++++++++
kernel/Makefile | 2 +
kernel/mem_table.c | 219 ++++++++++++++++++++++++++++++++++++++
5 files changed, 279 insertions(+)
create mode 100644 include/linux/mem_table.h
create mode 100644 kernel/mem_table.c

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b098b1fa2470..e7ae46953ae4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -25,6 +25,7 @@
#include <linux/static_call.h>
#include <linux/swiotlb.h>
#include <linux/random.h>
+#include <linux/mem_table.h>

#include <uapi/linux/mount.h>

@@ -1315,6 +1316,7 @@ void __init setup_arch(char **cmdline_p)
#endif

unwind_init();
+ mem_table_init(PG_LEVEL_4K);
}

#ifdef CONFIG_X86_32
diff --git a/include/linux/heki.h b/include/linux/heki.h
index 89cc9273a968..9b0c966c50d1 100644
--- a/include/linux/heki.h
+++ b/include/linux/heki.h
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/printk.h>
+#include <linux/slab.h>

#ifdef CONFIG_HEKI

diff --git a/include/linux/mem_table.h b/include/linux/mem_table.h
new file mode 100644
index 000000000000..738bf12309f3
--- /dev/null
+++ b/include/linux/mem_table.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Memory table feature - Definitions.
+ *
+ * Copyright © 2023 Microsoft Corporation.
+ */
+
+#ifndef __MEM_TABLE_H__
+#define __MEM_TABLE_H__
+
+/* clang-format off */
+
+/*
+ * The MEM_TABLE bit is set on entries that point to an intermediate table.
+ * So, this bit is reserved. This means that pointers to consumer data must
+ * be at least two-byte aligned (so the MEM_TABLE bit is 0).
+ */
+#define MEM_TABLE BIT(0)
+#define IS_LEAF(entry) !((uintptr_t)entry & MEM_TABLE)
+
+/* clang-format on */
+
+/*
+ * A memory table is arranged exactly like a page table. The memory table
+ * configuration reflects the hardware page table configuration.
+ */
+
+/* Parameters at each level of the memory table hierarchy. */
+struct mem_table_level {
+ unsigned int number;
+ unsigned int nentries;
+ unsigned int shift;
+ unsigned int mask;
+};
+
+struct mem_table {
+ struct mem_table_level *level;
+ struct mem_table_ops *ops;
+ bool changed;
+ void *entries[];
+};
+
+/* Operations that need to be supplied by a consumer of memory tables. */
+struct mem_table_ops {
+ void (*free)(void *buf);
+};
+
+void mem_table_init(unsigned int base_level);
+struct mem_table *mem_table_alloc(struct mem_table_ops *ops);
+void mem_table_free(struct mem_table *table);
+void **mem_table_create(struct mem_table *table, phys_addr_t pa);
+void **mem_table_find(struct mem_table *table, phys_addr_t pa,
+ unsigned int *level_num);
+
+#endif /* __MEM_TABLE_H__ */
diff --git a/kernel/Makefile b/kernel/Makefile
index 3947122d618b..dcef03ec5c54 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -131,6 +131,8 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o

+obj-$(CONFIG_SPARSEMEM) += mem_table.o
+
CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
diff --git a/kernel/mem_table.c b/kernel/mem_table.c
new file mode 100644
index 000000000000..280a1b5ddde0
--- /dev/null
+++ b/kernel/mem_table.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Memory table feature.
+ *
+ * This feature can be used by a consumer to associate any arbitrary pointer
+ * with a physical page. The feature implements a page table format that
+ * mirrors the hardware page table. A leaf entry in the table points to
+ * consumer data for that page.
+ *
+ * The page table format has these advantages:
+ *
+ * - The format allows for a sparse representation. This is useful since
+ * the physical address space can be large and is typically sparsely
+ * populated in a system.
+ *
+ * - A consumer of this feature can choose to populate data just for
+ * the pages he is interested in.
+ *
+ * - Information can be stored for large pages, if a consumer wishes.
+ *
+ * For instance, for Heki, the guest kernel uses this to create permissions
+ * counters for each guest physical page. The permissions counters reflects the
+ * collective permissions for a guest physical page across all mappings to that
+ * page. This allows the guest to request the hypervisor to set only the
+ * necessary permissions for a guest physical page in the EPT (instead of RWX).
+ *
+ * Copyright © 2023 Microsoft Corporation.
+ */
+
+/*
+ * Memory table functions use recursion for simplicity. The recursion is bounded
+ * by the number of hardware page table levels.
+ *
+ * Locking is left to the caller of these functions.
+ */
+#include <linux/heki.h>
+#include <linux/mem_table.h>
+#include <linux/pgtable.h>
+
+#define TABLE(entry) ((void *)((uintptr_t)entry & ~MEM_TABLE))
+#define ENTRY(table) ((void *)((uintptr_t)table | MEM_TABLE))
+
+/*
+ * Within this feature, the table levels start from 0. On X86, the base level
+ * is not 0.
+ */
+unsigned int mem_table_base_level __ro_after_init;
+unsigned int mem_table_nlevels __ro_after_init;
+struct mem_table_level mem_table_levels[CONFIG_PGTABLE_LEVELS] __ro_after_init;
+
+void __init mem_table_init(unsigned int base_level)
+{
+ struct mem_table_level *level;
+ unsigned long shift, delta_shift;
+ int physmem_bits;
+ int i, max_levels;
+
+ /*
+ * Compute the actual number of levels present. Compute the parameters
+ * for each level.
+ */
+ shift = ilog2(PAGE_SIZE);
+ physmem_bits = PAGE_SHIFT;
+ max_levels = CONFIG_PGTABLE_LEVELS;
+
+ for (i = 0; i < max_levels && physmem_bits < MAX_PHYSMEM_BITS; i++) {
+ level = &mem_table_levels[i];
+
+ switch (i) {
+ case 0:
+ level->nentries = PTRS_PER_PTE;
+ break;
+ case 1:
+ level->nentries = PTRS_PER_PMD;
+ break;
+ case 2:
+ level->nentries = PTRS_PER_PUD;
+ break;
+ case 3:
+ level->nentries = PTRS_PER_P4D;
+ break;
+ case 4:
+ level->nentries = PTRS_PER_PGD;
+ break;
+ }
+ level->number = i;
+ level->shift = shift;
+ level->mask = level->nentries - 1;
+
+ delta_shift = ilog2(level->nentries);
+ shift += delta_shift;
+ physmem_bits += delta_shift;
+ }
+ mem_table_nlevels = i;
+ mem_table_base_level = base_level;
+}
+
+struct mem_table *mem_table_alloc(struct mem_table_ops *ops)
+{
+ struct mem_table_level *level;
+ struct mem_table *table;
+
+ level = &mem_table_levels[mem_table_nlevels - 1];
+
+ table = kzalloc(struct_size(table, entries, level->nentries),
+ GFP_KERNEL);
+ if (table) {
+ table->level = level;
+ table->ops = ops;
+ return table;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(mem_table_alloc);
+
+static void _mem_table_free(struct mem_table *table)
+{
+ struct mem_table_level *level = table->level;
+ void **entries = table->entries;
+ struct mem_table_ops *ops = table->ops;
+ int i;
+
+ for (i = 0; i < level->nentries; i++) {
+ if (!entries[i])
+ continue;
+ if (IS_LEAF(entries[i])) {
+ /* The consumer frees the pointer. */
+ ops->free(entries[i]);
+ continue;
+ }
+ _mem_table_free(TABLE(entries[i]));
+ }
+ kfree(table);
+}
+
+void mem_table_free(struct mem_table *table)
+{
+ _mem_table_free(table);
+}
+EXPORT_SYMBOL_GPL(mem_table_free);
+
+static void **_mem_table_find(struct mem_table *table, phys_addr_t pa,
+ unsigned int *level_number)
+{
+ struct mem_table_level *level = table->level;
+ void **entries = table->entries;
+ unsigned long i;
+
+ i = (pa >> level->shift) & level->mask;
+
+ *level_number = level->number;
+ if (!entries[i])
+ return NULL;
+
+ if (IS_LEAF(entries[i]))
+ return &entries[i];
+
+ return _mem_table_find(TABLE(entries[i]), pa, level_number);
+}
+
+void **mem_table_find(struct mem_table *table, phys_addr_t pa,
+ unsigned int *level_number)
+{
+ void **entry;
+
+ entry = _mem_table_find(table, pa, level_number);
+ level_number += mem_table_base_level;
+
+ return entry;
+}
+EXPORT_SYMBOL_GPL(mem_table_find);
+
+static void **_mem_table_create(struct mem_table *table, phys_addr_t pa)
+{
+ struct mem_table_level *level = table->level;
+ void **entries = table->entries;
+ unsigned long i;
+
+ table->changed = true;
+ i = (pa >> level->shift) & level->mask;
+
+ if (!level->number) {
+ /*
+ * Reached the lowest level. Return a pointer to the entry
+ * so that the consumer can populate it.
+ */
+ return &entries[i];
+ }
+
+ /*
+ * If the entry is NULL, then create a lower level table and make the
+ * entry point to it. Or, if the entry is a leaf, then we need to
+ * split the entry. In this case as well, create a lower level table
+ * to split the entry.
+ */
+ if (!entries[i] || IS_LEAF(entries[i])) {
+ struct mem_table *next;
+
+ /* Create next level table. */
+ level--;
+ next = kzalloc(struct_size(table, entries, level->nentries),
+ GFP_KERNEL);
+ if (!next)
+ return NULL;
+
+ next->level = level;
+ next->ops = table->ops;
+ next->changed = true;
+ entries[i] = ENTRY(next);
+ }
+
+ return _mem_table_create(TABLE(entries[i]), pa);
+}
+
+void **mem_table_create(struct mem_table *table, phys_addr_t pa)
+{
+ return _mem_table_create(table, pa);
+}
+EXPORT_SYMBOL_GPL(mem_table_create);
--
2.42.1