[PATCH v7 4/4] perf report: LLVM-based symbol listing

From: Steinar H. Gunderson
Date: Sun May 26 2024 - 14:23:17 EST


Similar to the previous LLVM patches, this allows symbol enumeration
using LLVM instead of libbfd or libelf. This allows distro builds to
reliably enumerate symbols in e.g. PE binaries as profiled with WINE.

A lot of the logic for e.g. length detection has been carried over
from the libbfd code.

Signed-off-by: Steinar H. Gunderson <sesse@xxxxxxxxxx>
---
tools/perf/util/llvm-c-helpers.cpp | 124 +++++++++++++++++++++++++++++
tools/perf/util/llvm-c-helpers.h | 49 ++++++++++++
tools/perf/util/symbol.c | 54 ++++++++++++-
tools/perf/util/symbol.h | 3 +
4 files changed, 228 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/llvm-c-helpers.cpp b/tools/perf/util/llvm-c-helpers.cpp
index 4070e2d5682f..87caf98cc8f6 100644
--- a/tools/perf/util/llvm-c-helpers.cpp
+++ b/tools/perf/util/llvm-c-helpers.cpp
@@ -9,6 +9,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter" /* Needed for LLVM <= 15 */
#include <llvm/DebugInfo/Symbolize/Symbolize.h>
#include <llvm/Support/TargetSelect.h>
+#include <llvm/Object/Binary.h>
#pragma GCC diagnostic pop

#include <stdio.h>
@@ -17,6 +18,8 @@
extern "C" {
#include <linux/zalloc.h>
}
+#include <vector>
+#include <algorithm>
#include "symbol_conf.h"
#include "llvm-c-helpers.h"

@@ -24,6 +27,7 @@ extern "C"
char *dso__demangle_sym(struct dso *dso, int kmodule, const char *elf_name);

using namespace llvm;
+using namespace llvm::object;
using llvm::symbolize::LLVMSymbolizer;

/*
@@ -194,3 +198,123 @@ char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr)
dso, res_or_err->Name.c_str(),
addr, res_or_err->Start);
}
+
+int llvm_load_symbols(const char *debugfile, struct llvm_symbol_list *symbols)
+{
+ /* NOTE: This nominally does an mmap, despite the scary name. */
+ ErrorOr<std::unique_ptr<MemoryBuffer>> mem_buf_or_err =
+ MemoryBuffer::getFile(debugfile);
+ if (mem_buf_or_err.getError())
+ return -1;
+
+ Expected<std::unique_ptr<Binary>> binary_or_err(
+ createBinary(mem_buf_or_err.get()->getMemBufferRef(), nullptr));
+ if (!binary_or_err)
+ return -1;
+
+ /* Find the .text section. */
+ SectionRef text_section;
+ uint64_t text_filepos, image_base;
+ for (SectionRef section :
+ cast<ObjectFile>(*binary_or_err.get()).sections()) {
+ Expected<StringRef> name = section.getName();
+ if (name && *name == ".text") {
+ text_section = section;
+
+ /*
+ * If we don't find an image base below, we infer the
+ * image base * from the address and file offset of the
+ * .text section.
+ */
+ text_filepos = reinterpret_cast<const char *>(
+ text_section.getContents()->bytes_begin()) -
+ mem_buf_or_err.get()->getBufferStart();
+ image_base = text_section.getAddress() - text_filepos;
+ break;
+ }
+ }
+ if (text_section == SectionRef())
+ /* No .text section, so no symbols (but also not a failure). */
+ return 0;
+
+ /*
+ * See if we can find an explicit image base pseudosymbol. If so, get
+ * the image base directly from it, then infer the file position of
+ * .text from that (i.e., the opposite inference of the fallback above).
+ */
+ for (SymbolRef symbol :
+ cast<ObjectFile>(*binary_or_err.get()).symbols())
+ if (symbol.getName() &&
+ symbol.getAddress() &&
+ (*symbol.getName() == "__ImageBase" ||
+ *symbol.getName() == "__image_base__")) {
+ image_base = *symbol.getAddress();
+ if (image_base < 0x100000000ULL)
+ /*
+ * PE symbols can only have 4 bytes, so use
+ * .text high bits (if any).
+ */
+ image_base |= text_section.getAddress() &
+ ~0xFFFFFFFFULL;
+ text_filepos = text_section.getAddress() - image_base;
+ break;
+ }
+
+ symbols->image_base = image_base;
+ symbols->text_end = text_filepos + text_section.getSize();
+
+ /* Collect all valid symbols. */
+ std::vector<SymbolRef> all_symbols;
+ for (SymbolRef symbol :
+ cast<ObjectFile>(*binary_or_err.get()).symbols())
+ if (symbol.getName() && symbol.getFlags() &&
+ symbol.getAddress() && symbol.getSection())
+ all_symbols.push_back(symbol);
+ symbols->num_symbols = all_symbols.size();
+ symbols->symbols = (struct llvm_symbol *)calloc(
+ all_symbols.size(), sizeof(struct llvm_symbol));
+ if (symbols->symbols == nullptr)
+ return -1;
+
+ /*
+ * Symbols don't normally come with lengths, so we'll infer them
+ * from what comes after the symbol address-wise. There is some
+ * extra logic around zero-length symbols and deduplication,
+ * which the caller will do for us (it's shared with other backends).
+ */
+ std::sort(all_symbols.begin(), all_symbols.end(),
+ [](const SymbolRef &a, const SymbolRef &b) {
+ if (*a.getAddress() != *b.getAddress())
+ return *a.getAddress() < *b.getAddress();
+ return *a.getName() < *b.getName();
+ });
+ for (size_t i = 0; i < all_symbols.size(); ++i) {
+ const SymbolRef &sym = all_symbols[i];
+ llvm_symbol &out_sym = symbols->symbols[i];
+ out_sym.start = *sym.getAddress() - image_base;
+ out_sym.name = (char *)calloc(1, sym.getName()->size() + 1);
+ if (out_sym.name == nullptr) {
+ for (size_t i = 0; i < all_symbols.size(); ++i) {
+ zfree(&symbols->symbols[i].name);
+ }
+ zfree(&symbols->symbols);
+ return -1;
+ }
+ memcpy(out_sym.name, sym.getName()->bytes_begin(),
+ sym.getName()->size());
+ out_sym.global = *sym.getFlags() & SymbolRef::SF_Global;
+ out_sym.weak = *sym.getFlags() & SymbolRef::SF_Weak;
+
+ SectionRef section = **sym.getSection();
+ uint64_t next_addr;
+ if (i + 1 < all_symbols.size() &&
+ section == **all_symbols[i + 1].getSection())
+ next_addr = *all_symbols[i + 1].getAddress();
+ else
+ next_addr = section.getAddress() + section.getSize();
+
+ out_sym.len = next_addr - *sym.getAddress();
+ }
+
+ return 1;
+}
diff --git a/tools/perf/util/llvm-c-helpers.h b/tools/perf/util/llvm-c-helpers.h
index d2b99637a28a..1abbc9ef3dab 100644
--- a/tools/perf/util/llvm-c-helpers.h
+++ b/tools/perf/util/llvm-c-helpers.h
@@ -8,6 +8,7 @@
*/

#include <linux/compiler.h>
+#include <stddef.h>

#ifdef __cplusplus
extern "C" {
@@ -53,6 +54,54 @@ int llvm_addr2line(const char* dso_name,
char *llvm_name_for_code(struct dso *dso, const char *dso_name, u64 addr);
char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr);

+struct llvm_symbol {
+ u64 start, len;
+ bool weak, global;
+
+ /* Must be freed by the caller. */
+ char *name;
+};
+
+struct llvm_symbol_list {
+ /*
+ * The address this dso is loaded at in memory; adding this to
+ * the file position of any symbol will give the address that
+ * symbol is loaded at.
+ */
+ u64 image_base;
+
+ /*
+ * The byte offset of the end of the .text segment (one-past-the-end)
+ * in the file.
+ */
+ u64 text_end;
+
+ /*
+ * The actual symbols. Must be freed by the caller, with free().
+ * The string inside each llvm_symbol must also be freed.
+ */
+ size_t num_symbols;
+ struct llvm_symbol *symbols;
+};
+
+/*
+ * Get a list of all symbols in the given file using LLVM. Similarly to
+ * addr2line(), we need to implement this using a helper structure;
+ * LLVM actually has a C API for this, but we can't use it (it doesn't
+ * have an equivalent of getFlags()). So we do the meat of the LLVM-facing
+ * logic here and let the caller make perf symbols out of it.
+ *
+ * Possible return values and their meaning:
+ *
+ * -1: An error occurred.
+ * 0: The file loaded fine, but there were no .text section.
+ * 1: The file loaded successfully.
+ *
+ * For -1 and 0, "symbols" does not contain any meaningful information,
+ * and nothing needs to be freed.
+ */
+int llvm_load_symbols(const char *debugfile, struct llvm_symbol_list *symbols);
+
#ifdef __cplusplus
}
#endif
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 9e5940b5bc59..69b8c0a280c7 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -22,6 +22,7 @@
#include "util.h" // lsdir()
#include "debug.h"
#include "event.h"
+#include "llvm-c-helpers.h"
#include "machine.h"
#include "map.h"
#include "symbol.h"
@@ -1566,6 +1567,50 @@ static int dso__load_perf_map(const char *map_path, struct dso *dso)
return -1;
}

+#ifdef HAVE_LIBLLVM_SUPPORT
+int dso__load_llvm_symbols(struct dso *dso, const char *debugfile)
+{
+ struct llvm_symbol_list symbols;
+ struct symbol *symbol;
+ int err = llvm_load_symbols(debugfile, &symbols);
+ int ret = -1;
+
+ if (err == -1)
+ return -1;
+ else if (err == 0)
+ return 0;
+
+ for (size_t i = 0; i < symbols.num_symbols; ++i) {
+ const struct llvm_symbol *sym = &symbols.symbols[i];
+ int elf_binding;
+
+ if (sym->weak)
+ elf_binding = STB_WEAK;
+ else if (sym->global)
+ elf_binding = STB_GLOBAL;
+ else
+ elf_binding = STB_LOCAL;
+ symbol = symbol__new(sym->start, sym->len, elf_binding, STT_FUNC,
+ sym->name);
+ if (!symbol)
+ goto out_free;
+
+ symbols__insert(dso__symbols(dso), symbol);
+ }
+
+ symbols__fixup_end(dso__symbols(dso), false);
+ symbols__fixup_duplicate(dso__symbols(dso));
+ dso__set_adjust_symbols(dso, true);
+ ret = 0;
+
+out_free:
+ for (size_t i = 0; i < symbols.num_symbols; ++i)
+ zfree(&symbols.symbols[i].name);
+ zfree(&symbols.symbols);
+ return ret;
+}
+#endif
+
#ifdef HAVE_LIBBFD_SUPPORT
#define PACKAGE 'perf'
#include <bfd.h>
@@ -1871,6 +1916,7 @@ int dso__load(struct dso *dso, struct map *map)
bool next_slot = false;
bool is_reg;
bool nsexit;
+ int llvmrc = -1;
int bfdrc = -1;
int sirc = -1;

@@ -1899,11 +1945,15 @@ int dso__load(struct dso *dso, struct map *map)
}
}

-#ifdef HAVE_LIBBFD_SUPPORT
+#ifdef HAVE_LIBLLVM_SUPPORT
if (is_reg)
+ llvmrc = dso__load_llvm_symbols(dso, name);
+#endif
+#ifdef HAVE_LIBBFD_SUPPORT
+ if (is_reg && llvmrc < 0)
bfdrc = dso__load_bfd_symbols(dso, name);
#endif
- if (is_reg && bfdrc < 0)
+ if (is_reg && llvmrc < 0 && bfdrc < 0)
sirc = symsrc__init(ss, dso, name, symtab_type);

if (nsexit)
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 3fb5d146d9b1..01256c41467a 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -174,6 +174,9 @@ int symbol__config_symfs(const struct option *opt __maybe_unused,

struct symsrc;

+#ifdef HAVE_LIBLLVM_SUPPORT
+int dso__load_llvm_symbols(struct dso *dso, const char *debugfile);
+#endif
#ifdef HAVE_LIBBFD_SUPPORT
int dso__load_bfd_symbols(struct dso *dso, const char *debugfile);
#endif
--
2.45.1