[PATCH v2 51/59] x86/calldepth: Add ret/call counting for debug

From: Peter Zijlstra
Date: Fri Sep 02 2022 - 10:28:09 EST


From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>

Add a debugfs mechanism to validate the accounting, e.g. vs. call/ret
balance, and to gather statistics about the stuffing-to-call ratio.
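
With CONFIG_CALL_THUNKS_DEBUG=y the counters are exposed as one debugfs
file per possible CPU. Assuming debugfs is mounted at the conventional
/sys/kernel/debug, the resulting layout and per-CPU output (field values
are placeholders, format as emitted by callthunks_debug_show()) look
roughly like:

  /sys/kernel/debug/callthunks/cpu0
  /sys/kernel/debug/callthunks/cpu1
  ...

  C: <calls>  R: <rets>  S: <stuffs>  X: <ctxsw>

i.e. the call, return, stuff and context switch counts for that CPU.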

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
arch/x86/include/asm/nospec-branch.h | 36 +++++++++++++++++++++--
arch/x86/kernel/callthunks.c | 53 +++++++++++++++++++++++++++++++++++
arch/x86/lib/retpoline.S | 7 +++-
3 files changed, 91 insertions(+), 5 deletions(-)

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -57,6 +57,22 @@
#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL
#define RET_DEPTH_CREDIT 0xffffffffffffffffULL

+#ifdef CONFIG_CALL_THUNKS_DEBUG
+# define CALL_THUNKS_DEBUG_INC_CALLS \
+ incq %gs:__x86_call_count;
+# define CALL_THUNKS_DEBUG_INC_RETS \
+ incq %gs:__x86_ret_count;
+# define CALL_THUNKS_DEBUG_INC_STUFFS \
+ incq %gs:__x86_stuffs_count;
+# define CALL_THUNKS_DEBUG_INC_CTXSW \
+ incq %gs:__x86_ctxsw_count;
+#else
+# define CALL_THUNKS_DEBUG_INC_CALLS
+# define CALL_THUNKS_DEBUG_INC_RETS
+# define CALL_THUNKS_DEBUG_INC_STUFFS
+# define CALL_THUNKS_DEBUG_INC_CTXSW
+#endif
+
#ifdef CONFIG_CALL_DEPTH_TRACKING
#define CREDIT_CALL_DEPTH \
movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
@@ -72,18 +88,23 @@
#define RESET_CALL_DEPTH_FROM_CALL \
mov $0xfc, %rax; \
shl $56, %rax; \
- movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
+ CALL_THUNKS_DEBUG_INC_CALLS

#define INCREMENT_CALL_DEPTH \
- sarq $5, %gs:pcpu_hot + X86_call_depth;
+ sarq $5, %gs:pcpu_hot + X86_call_depth; \
+ CALL_THUNKS_DEBUG_INC_CALLS

#define ASM_INCREMENT_CALL_DEPTH \
- sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+ sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
+ CALL_THUNKS_DEBUG_INC_CALLS

#else
#define CREDIT_CALL_DEPTH
+#define ASM_CREDIT_CALL_DEPTH
#define RESET_CALL_DEPTH
#define INCREMENT_CALL_DEPTH
+#define ASM_INCREMENT_CALL_DEPTH
#define RESET_CALL_DEPTH_FROM_CALL
#endif

@@ -134,7 +155,8 @@
jnz 771b; \
/* barrier for jnz misprediction */ \
lfence; \
- ASM_CREDIT_CALL_DEPTH
+ ASM_CREDIT_CALL_DEPTH \
+ CALL_THUNKS_DEBUG_INC_CTXSW
#else
/*
* i386 doesn't unconditionally have LFENCE, as such it can't
@@ -319,6 +341,12 @@ static inline void x86_set_skl_return_th
{
x86_return_thunk = &__x86_return_skl;
}
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DECLARE_PER_CPU(u64, __x86_call_count);
+DECLARE_PER_CPU(u64, __x86_ret_count);
+DECLARE_PER_CPU(u64, __x86_stuffs_count);
+DECLARE_PER_CPU(u64, __x86_ctxsw_count);
+#endif
#else
static inline void x86_set_skl_return_thunk(void) {}
#endif
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -2,6 +2,7 @@

#define pr_fmt(fmt) "callthunks: " fmt

+#include <linux/debugfs.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/moduleloader.h>
@@ -35,6 +36,15 @@ static int __init debug_thunks(char *str
}
__setup("debug-callthunks", debug_thunks);

+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DEFINE_PER_CPU(u64, __x86_call_count);
+DEFINE_PER_CPU(u64, __x86_ret_count);
+DEFINE_PER_CPU(u64, __x86_stuffs_count);
+DEFINE_PER_CPU(u64, __x86_ctxsw_count);
+EXPORT_SYMBOL_GPL(__x86_ctxsw_count);
+EXPORT_SYMBOL_GPL(__x86_call_count);
+#endif
+
extern s32 __call_sites[], __call_sites_end[];

struct thunk_desc {
@@ -283,3 +293,46 @@ void noinline callthunks_patch_module_ca
mutex_unlock(&text_mutex);
}
#endif /* CONFIG_MODULES */
+
+#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS)
+static int callthunks_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+
+ seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n",
+ per_cpu(__x86_call_count, cpu),
+ per_cpu(__x86_ret_count, cpu),
+ per_cpu(__x86_stuffs_count, cpu),
+ per_cpu(__x86_ctxsw_count, cpu));
+ return 0;
+}
+
+static int callthunks_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, callthunks_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_ops = {
+ .open = callthunks_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init callthunks_debugfs_init(void)
+{
+ struct dentry *dir;
+ unsigned long cpu;
+
+ dir = debugfs_create_dir("callthunks", NULL);
+ for_each_possible_cpu(cpu) {
+ void *arg = (void *)cpu;
+ char name[10];
+
+ sprintf(name, "cpu%lu", cpu);
+ debugfs_create_file(name, 0644, dir, arg, &dfs_ops);
+ }
+ return 0;
+}
+__initcall(callthunks_debugfs_init);
+#endif
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -203,13 +203,18 @@ EXPORT_SYMBOL(__x86_return_thunk)
.align 64
SYM_FUNC_START(__x86_return_skl)
ANNOTATE_NOENDBR
- /* Keep the hotpath in a 16byte I-fetch */
+ /*
+ * Keep the hotpath in a 16byte I-fetch for the non-debug
+ * case.
+ */
+ CALL_THUNKS_DEBUG_INC_RETS
shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
jz 1f
ANNOTATE_UNRET_SAFE
ret
int3
1:
+ CALL_THUNKS_DEBUG_INC_STUFFS
.rept 16
ANNOTATE_INTRA_FUNCTION_CALL
call 2f