Re: [PATCH RFC 09/10] unwind: Introduce SFrame user space unwinding

From: Indu Bhagat
Date: Thu Nov 09 2023 - 14:32:23 EST


On 11/8/23 16:41, Josh Poimboeuf wrote:
Some distros have started compiling frame pointers into all their
packages to enable the kernel to do system-wide profiling of user space.
Unfortunately that creates a runtime performance penalty across the
entire system. Using DWARF instead isn't feasible due to complexity and
slowness.

For in-kernel unwinding we solved this problem with the creation of the
ORC unwinder for x86_64. Similarly, for user space the GNU assembler
has created the SFrame format starting with binutils 2.40. SFrame is a
simpler version of .eh_frame which gets placed in the .sframe section.

Add support for unwinding user space using SFrame.

More information about SFrame can be found here:

- https://lwn.net/Articles/932209/
- https://lwn.net/Articles/940686/
- https://sourceware.org/binutils/docs/sframe-spec.html

Signed-off-by: Josh Poimboeuf <jpoimboe@xxxxxxxxxx>
---
arch/Kconfig | 3 +
arch/x86/include/asm/mmu.h | 2 +-
fs/binfmt_elf.c | 46 +++-
include/linux/mm_types.h | 3 +
include/linux/sframe.h | 46 ++++
include/linux/user_unwind.h | 1 +
include/uapi/linux/elf.h | 1 +
include/uapi/linux/prctl.h | 3 +
kernel/fork.c | 10 +
kernel/sys.c | 11 +
kernel/unwind/Makefile | 1 +
kernel/unwind/sframe.c | 414 ++++++++++++++++++++++++++++++++++++
kernel/unwind/sframe.h | 217 +++++++++++++++++++
kernel/unwind/user.c | 15 +-
mm/init-mm.c | 2 +
15 files changed, 768 insertions(+), 7 deletions(-)
create mode 100644 include/linux/sframe.h
create mode 100644 kernel/unwind/sframe.c
create mode 100644 kernel/unwind/sframe.h


Thanks for working on this.

diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
new file mode 100644
index 000000000000..b167c19497e5
--- /dev/null
+++ b/kernel/unwind/sframe.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/sframe.h>
+#include <linux/user_unwind.h>
+
+#include "sframe.h"
+
+#define SFRAME_FILENAME_LEN 32
+
+struct sframe_section {
+ struct rcu_head rcu;
+
+ unsigned long sframe_addr;
+ unsigned long text_addr;
+
+ unsigned long fdes_addr;
+ unsigned long fres_addr;
+ unsigned int fdes_num;
+ signed char ra_off, fp_off;
+};
+
+DEFINE_STATIC_SRCU(sframe_srcu);
+
+#define __SFRAME_GET_USER(out, user_ptr, type) \
+({ \
+ type __tmp; \
+ if (get_user(__tmp, (type *)user_ptr)) \
+ return -EFAULT; \
+ user_ptr += sizeof(__tmp); \
+ out = __tmp; \
+})
+
+#define SFRAME_GET_USER_SIGNED(out, user_ptr, size) \
+({ \
+ switch (size) { \
+ case 1: \
+ __SFRAME_GET_USER(out, user_ptr, s8); \
+ break; \
+ case 2: \
+ __SFRAME_GET_USER(out, user_ptr, s16); \
+ break; \
+ case 4: \
+ __SFRAME_GET_USER(out, user_ptr, s32); \
+ break; \
+ default: \
+ return -EINVAL; \
+ } \
+})
+
+#define SFRAME_GET_USER_UNSIGNED(out, user_ptr, size) \
+({ \
+ switch (size) { \
+ case 1: \
+ __SFRAME_GET_USER(out, user_ptr, u8); \
+ break; \
+ case 2: \
+ __SFRAME_GET_USER(out, user_ptr, u16); \
+ break; \
+ case 4: \
+ __SFRAME_GET_USER(out, user_ptr, u32); \
+ break; \
+ default: \
+ return -EINVAL; \
+ } \
+})
+
+static unsigned char fre_type_to_size(unsigned char fre_type)
+{
+ if (fre_type > 2)
+ return 0;
+ return 1 << fre_type;
+}
+
+static unsigned char offset_size_enum_to_size(unsigned char off_size)
+{
+ if (off_size > 2)
+ return 0;
+ return 1 << off_size;
+}
+
+static int find_fde(struct sframe_section *sec, unsigned long ip,
+ struct sframe_fde *fde)
+{
+ s32 func_off, ip_off;
+ struct sframe_fde __user *first, *last, *mid, *found;
+
+ ip_off = ip - sec->sframe_addr;
+
+ first = (void *)sec->fdes_addr;
+ last = first + sec->fdes_num;
+ while (first <= last) {
+ mid = first + ((last - first) / 2);
+ if (get_user(func_off, (s32 *)mid))
+ return -EFAULT;
+ if (ip_off >= func_off) {
+ found = mid;
+ first = mid + 1;
+ } else
+ last = mid - 1;
+ }
+
+ if (!found)
+ return -EINVAL;
+
+ if (copy_from_user(fde, found, sizeof(*fde)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
+ unsigned long ip, struct user_unwind_frame *frame)
+{
+ unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
+ unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
+ s32 fre_ip_off, cfa_off, ra_off, fp_off, ip_off;
+ unsigned char offset_count, offset_size;
+ unsigned char addr_size;
+ void __user *f, *last_f;
+ u8 fre_info;
+ int i;
+
+ addr_size = fre_type_to_size(fre_type);
+ if (!addr_size)
+ return -EINVAL;
+
+ ip_off = ip - sec->sframe_addr - fde->start_addr;
+
+ f = (void *)sec->fres_addr + fde->fres_off;
+
+ for (i = 0; i < fde->fres_num; i++) {
+
+ SFRAME_GET_USER_UNSIGNED(fre_ip_off, f, addr_size);
+
+ if (fde_type == SFRAME_FDE_TYPE_PCINC) {
+ if (fre_ip_off > ip_off)
+ break;
+ } else {
+ /* SFRAME_FDE_TYPE_PCMASK */
+#if 0 /* sframe v2 */
+ if (ip_off % fde->rep_size < fre_ip_off)
+ break;
+#endif
+ }
+
+ SFRAME_GET_USER_UNSIGNED(fre_info, f, 1);
+
+ offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info);
+ offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info));
+
+ if (!offset_count || !offset_size)
+ return -EINVAL;
+
+ last_f = f;
+ f += offset_count * offset_size;
+ }
+
+ if (!last_f)
+ return -EINVAL;
+
+ f = last_f;
+
+ SFRAME_GET_USER_UNSIGNED(cfa_off, f, offset_size);
+ offset_count--;
+
+ ra_off = sec->ra_off;
+ if (!ra_off) {
+ if (!offset_count--)
+ return -EINVAL;
+ SFRAME_GET_USER_SIGNED(ra_off, f, offset_size);
+ }
+
+ fp_off = sec->fp_off;
+ if (!fp_off && offset_count) {
+ offset_count--;
+ SFRAME_GET_USER_SIGNED(fp_off, f, offset_size);
+ }
+
+ if (offset_count)
+ return -EINVAL;
+
+ frame->cfa_off = cfa_off;
+ frame->ra_off = ra_off;
+ frame->fp_off = fp_off;
+ frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP;
+
+ return 0;
+}
+
+int sframe_find(unsigned long ip, struct user_unwind_frame *frame)
+{
+ struct mm_struct *mm = current->mm;
+ struct sframe_section *sec;
+ struct sframe_fde fde;
+ int srcu_idx;
+ int ret = -EINVAL;
+
+ srcu_idx = srcu_read_lock(&sframe_srcu);
+
+ sec = mtree_load(&mm->sframe_mt, ip);
+ if (!sec) {
+ srcu_read_unlock(&sframe_srcu, srcu_idx);
+ return -EINVAL;
+ }
+
+
+ ret = find_fde(sec, ip, &fde);
+ if (ret)
+ goto err_unlock;
+
+ ret = find_fre(sec, &fde, ip, frame);
+ if (ret)
+ goto err_unlock;
+
+ srcu_read_unlock(&sframe_srcu, srcu_idx);
+ return 0;
+
+err_unlock:
+ srcu_read_unlock(&sframe_srcu, srcu_idx);
+ return ret;
+}
+
+static int get_sframe_file(unsigned long sframe_addr, struct sframe_file *file)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *sframe_vma, *text_vma, *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mmap_read_lock(mm);
+
+ sframe_vma = vma_lookup(mm, sframe_addr);
+ if (!sframe_vma || !sframe_vma->vm_file)
+ goto err_unlock;
+
+ text_vma = NULL;
+
+ for_each_vma(vmi, vma) {
+ if (vma->vm_file != sframe_vma->vm_file)
+ continue;
+ if (vma->vm_flags & VM_EXEC) {
+ if (text_vma) {
+ /*
+ * Multiple EXEC segments in a single file
+ * aren't currently supported, is that a thing?
+ */
+ WARN_ON_ONCE(1);
+ goto err_unlock;
+ }
+ text_vma = vma;
+ }
+ }
+
+ file->sframe_addr = sframe_addr;
+ file->text_start = text_vma->vm_start;
+ file->text_end = text_vma->vm_end;
+
+ mmap_read_unlock(mm);
+ return 0;
+
+err_unlock:
+ mmap_read_unlock(mm);
+ return -EINVAL;
+}
+
+static int validate_sframe_addrs(struct sframe_file *file)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *text_vma;
+
+ mmap_read_lock(mm);
+
+ if (!vma_lookup(mm, file->sframe_addr))
+ goto err_unlock;
+
+ text_vma = vma_lookup(mm, file->text_start);
+ if (!(text_vma->vm_flags & VM_EXEC))
+ goto err_unlock;
+
+ if (vma_lookup(mm, file->text_end-1) != text_vma)
+ goto err_unlock;
+
+ mmap_read_unlock(mm);
+ return 0;
+
+err_unlock:
+ mmap_read_unlock(mm);
+ return -EINVAL;
+}
+
+int __sframe_add_section(struct sframe_file *file)
+{
+ struct maple_tree *sframe_mt = &current->mm->sframe_mt;
+ struct sframe_section *sec;
+ struct sframe_header shdr;
+ unsigned long header_end;
+ int ret;
+
+ if (copy_from_user(&shdr, (void *)file->sframe_addr, sizeof(shdr)))
+ return -EFAULT;
+
+ if (shdr.preamble.magic != SFRAME_MAGIC ||
+ shdr.preamble.version != SFRAME_VERSION_1 ||
+ (!shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
+ shdr.auxhdr_len || !shdr.num_fdes || !shdr.num_fres ||
+ shdr.fdes_off > shdr.fres_off) {
+ return -EINVAL;
+ }
+

I would say that it will be ideal to start the support with SFRAME_VERSION_2 onwards, if we have a choice.

The structure SFrame FDE in SFRAME_VERSION_1 was unaligned on-disk. We fixed that in SFRAME_VERSION_2 (Binutils 2.41) by adding some padding as you have already noted. For x86_64, its not an issue though, yes.