Re: [PATCH v7 4/4] perf report: LLVM-based symbol listing

From: Namhyung Kim
Date: Wed May 29 2024 - 00:12:01 EST


On Sun, May 26, 2024 at 11:23 AM Steinar H. Gunderson <sesse@googlecom> wrote:
>
> Similar to the previous LLVM patches, this allows symbol enumeration
> using LLVM instead of libbfd or libelf. This allows distro builds to
> reliably enumerate symbols in e.g. PE binaries as profiled with WINE.
>
> A lot of the logic for e.g. length detection has been carried over
> from the libbfd code.
>
> Signed-off-by: Steinar H. Gunderson <sesse@xxxxxxxxxx>
> ---
> tools/perf/util/llvm-c-helpers.cpp | 124 +++++++++++++++++++++++++++++
> tools/perf/util/llvm-c-helpers.h | 49 ++++++++++++
> tools/perf/util/symbol.c | 54 ++++++++++++-
> tools/perf/util/symbol.h | 3 +
> 4 files changed, 228 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/util/llvm-c-helpers.cpp b/tools/perf/util/llvm-c-helpers.cpp
> index 4070e2d5682f..87caf98cc8f6 100644
> --- a/tools/perf/util/llvm-c-helpers.cpp
> +++ b/tools/perf/util/llvm-c-helpers.cpp
> @@ -9,6 +9,7 @@
> #pragma GCC diagnostic ignored "-Wunused-parameter" /* Needed for LLVM <= 15 */
> #include <llvm/DebugInfo/Symbolize/Symbolize.h>
> #include <llvm/Support/TargetSelect.h>
> +#include <llvm/Object/Binary.h>
> #pragma GCC diagnostic pop
>
> #include <stdio.h>
> @@ -17,6 +18,8 @@
> extern "C" {
> #include <linux/zalloc.h>
> }
> +#include <vector>
> +#include <algorithm>
> #include "symbol_conf.h"
> #include "llvm-c-helpers.h"
>
> @@ -24,6 +27,7 @@ extern "C"
> char *dso__demangle_sym(struct dso *dso, int kmodule, const char *elf_name);
>
> using namespace llvm;
> +using namespace llvm::object;
> using llvm::symbolize::LLVMSymbolizer;
>
> /*
> @@ -194,3 +198,123 @@ char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr)
> dso, res_or_err->Name.c_str(),
> addr, res_or_err->Start);
> }
> +
> +int llvm_load_symbols(const char *debugfile, struct llvm_symbol_list *symbols)
> +{
> + /* NOTE: This nominally does an mmap, despite the scary name. */
> + ErrorOr<std::unique_ptr<MemoryBuffer>> mem_buf_or_err =
> + MemoryBuffer::getFile(debugfile);
> + if (mem_buf_or_err.getError())
> + return -1;
> +
> + Expected<std::unique_ptr<Binary>> binary_or_err(
> + createBinary(mem_buf_or_err.get()->getMemBufferRef(), nullptr));
> + if (!binary_or_err)
> + return -1;
> +
> + /* Find the .text section. */
> + SectionRef text_section;
> + uint64_t text_filepos, image_base;
> + for (SectionRef section :
> + cast<ObjectFile>(*binary_or_err.get()).sections()) {
> + Expected<StringRef> name = section.getName();
> + if (name && *name == ".text") {
> + text_section = section;
> +
> + /*
> + * If we don't find an image base below, we infer the
> + * image base * from the address and file offset of the
> + * .text section.
> + */
> + text_filepos = reinterpret_cast<const char *>(
> + text_section.getContents()->bytes_begin()) -
> + mem_buf_or_err.get()->getBufferStart();
> + image_base = text_section.getAddress() - text_filepos;
> + break;
> + }
> + }
> + if (text_section == SectionRef())
> + /* No .text section, so no symbols (but also not a failure). */
> + return 0;
> +
> + /*
> + * See if we can find an explicit image base pseudosymbol. If so, get
> + * the image base directly from it, then infer the file position of
> + * .text from that (i.e., the opposite inference of the fallback above).
> + */

If it's PE-specific, can we skip this based on the target type or
something?


> + for (SymbolRef symbol :
> + cast<ObjectFile>(*binary_or_err.get()).symbols())
> + if (symbol.getName() &&
> + symbol.getAddress() &&
> + (*symbol.getName() == "__ImageBase" ||
> + *symbol.getName() == "__image_base__")) {
> + image_base = *symbol.getAddress();
> + if (image_base < 0x100000000ULL)
> + /*
> + * PE symbols can only have 4 bytes, so use
> + * .text high bits (if any).
> + */
> + image_base |= text_section.getAddress() &
> + ~0xFFFFFFFFULL;
> + text_filepos = text_section.getAddress() - image_base;
> + break;
> + }
> +
> + symbols->image_base = image_base;
> + symbols->text_end = text_filepos + text_section.getSize();
> +
> + /* Collect all valid symbols. */
> + std::vector<SymbolRef> all_symbols;
> + for (SymbolRef symbol :
> + cast<ObjectFile>(*binary_or_err.get()).symbols())
> + if (symbol.getName() && symbol.getFlags() &&
> + symbol.getAddress() && symbol.getSection())
> + all_symbols.push_back(symbol);
> + symbols->num_symbols = all_symbols.size();
> + symbols->symbols = (struct llvm_symbol *)calloc(
> + all_symbols.size(), sizeof(struct llvm_symbol));
> + if (symbols->symbols == nullptr)
> + return -1;

I'm not sure if it's compatible with what we do in the
dso__load_sym_internal(). Probably ARM and PPC folks
should check if it works well.

Thanks,
Namhyung


> +
> + /*
> + * Symbols don't normally come with lengths, so we'll infer them
> + * from what comes after the symbol address-wise. There is some
> + * extra logic around zero-length symbols and deduplication,
> + * which the caller will do for us (it's shared with other backends).
> + */
> + std::sort(all_symbols.begin(), all_symbols.end(),
> + [](const SymbolRef &a, const SymbolRef &b) {
> + if (*a.getAddress() != *b.getAddress())
> + return *a.getAddress() < *b.getAddress();
> + return *a.getName() < *b.getName();
> + });
> + for (size_t i = 0; i < all_symbols.size(); ++i) {
> + const SymbolRef &sym = all_symbols[i];
> + llvm_symbol &out_sym = symbols->symbols[i];
> + out_sym.start = *sym.getAddress() - image_base;
> + out_sym.name = (char *)calloc(1, sym.getName()->size() + 1);
> + if (out_sym.name == nullptr) {
> + for (size_t i = 0; i < all_symbols.size(); ++i) {
> + zfree(&symbols->symbols[i].name);
> + }
> + zfree(&symbols->symbols);
> + return -1;
> + }
> + memcpy(out_sym.name, sym.getName()->bytes_begin(),
> + sym.getName()->size());
> + out_sym.global = *sym.getFlags() & SymbolRef::SF_Global;
> + out_sym.weak = *sym.getFlags() & SymbolRef::SF_Weak;
> +
> + SectionRef section = **sym.getSection();
> + uint64_t next_addr;
> + if (i + 1 < all_symbols.size() &&
> + section == **all_symbols[i + 1].getSection())
> + next_addr = *all_symbols[i + 1].getAddress();
> + else
> + next_addr = section.getAddress() + section.getSize();
> +
> + out_sym.len = next_addr - *sym.getAddress();
> + }
> +
> + return 1;
> +}
> diff --git a/tools/perf/util/llvm-c-helpers.h b/tools/perf/util/llvm-c-helpers.h
> index d2b99637a28a..1abbc9ef3dab 100644
> --- a/tools/perf/util/llvm-c-helpers.h
> +++ b/tools/perf/util/llvm-c-helpers.h
> @@ -8,6 +8,7 @@
> */
>
> #include <linux/compiler.h>
> +#include <stddef.h>
>
> #ifdef __cplusplus
> extern "C" {
> @@ -53,6 +54,54 @@ int llvm_addr2line(const char* dso_name,
> char *llvm_name_for_code(struct dso *dso, const char *dso_name, u64 addr);
> char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr);
>
> +struct llvm_symbol {
> + u64 start, len;
> + bool weak, global;
> +
> + /* Must be freed by the caller. */
> + char *name;
> +};
> +
> +struct llvm_symbol_list {
> + /*
> + * The address this dso is loaded at in memory; adding this to
> + * the file position of any symbol will give the address that
> + * symbol is loaded at.
> + */
> + u64 image_base;
> +
> + /*
> + * The byte offset of the end of the .text segment (one-past-the-end)
> + * in the file.
> + */
> + u64 text_end;
> +
> + /*
> + * The actual symbols. Must be freed by the caller, with free().
> + * The string inside each llvm_symbol must also be freed.
> + */
> + size_t num_symbols;
> + struct llvm_symbol *symbols;
> +};
> +
> +/*
> + * Get a list of all symbols in the given file using LLVM. Similarly to
> + * addr2line(), we need to implement this using a helper structure;
> + * LLVM actually has a C API for this, but we can't use it (it doesn't
> + * have an equivalent of getFlags()). So we do the meat of the LLVM-facing
> + * logic here and let the caller make perf symbols out of it.
> + *
> + * Possible return values and their meaning:
> + *
> + * -1: An error occurred.
> + * 0: The file loaded fine, but there were no .text section.
> + * 1: The file loaded successfully.
> + *
> + * For -1 and 0, "symbols" does not contain any meaningful information,
> + * and nothing needs to be freed.
> + */
> +int llvm_load_symbols(const char *debugfile, struct llvm_symbol_list *symbols);
> +
> #ifdef __cplusplus
> }
> #endif
> diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
> index 9e5940b5bc59..69b8c0a280c7 100644
> --- a/tools/perf/util/symbol.c
> +++ b/tools/perf/util/symbol.c
> @@ -22,6 +22,7 @@
> #include "util.h" // lsdir()
> #include "debug.h"
> #include "event.h"
> +#include "llvm-c-helpers.h"
> #include "machine.h"
> #include "map.h"
> #include "symbol.h"
> @@ -1566,6 +1567,50 @@ static int dso__load_perf_map(const char *map_path, struct dso *dso)
> return -1;
> }
>
> +#ifdef HAVE_LIBLLVM_SUPPORT
> +int dso__load_llvm_symbols(struct dso *dso, const char *debugfile)
> +{
> + struct llvm_symbol_list symbols;
> + struct symbol *symbol;
> + int err = llvm_load_symbols(debugfile, &symbols);
> + int ret = -1;
> +
> + if (err == -1)
> + return -1;
> + else if (err == 0)
> + return 0;
> +
> + for (size_t i = 0; i < symbols.num_symbols; ++i) {
> + const struct llvm_symbol *sym = &symbols.symbols[i];
> + int elf_binding;
> +
> + if (sym->weak)
> + elf_binding = STB_WEAK;
> + else if (sym->global)
> + elf_binding = STB_GLOBAL;
> + else
> + elf_binding = STB_LOCAL;
> + symbol = symbol__new(sym->start, sym->len, elf_binding, STT_FUNC,
> + sym->name);
> + if (!symbol)
> + goto out_free;
> +
> + symbols__insert(dso__symbols(dso), symbol);
> + }
> +
> + symbols__fixup_end(dso__symbols(dso), false);
> + symbols__fixup_duplicate(dso__symbols(dso));
> + dso__set_adjust_symbols(dso, true);
> + ret = 0;
> +
> +out_free:
> + for (size_t i = 0; i < symbols.num_symbols; ++i)
> + zfree(&symbols.symbols[i].name);
> + zfree(&symbols.symbols);
> + return ret;
> +}
> +#endif
> +
> #ifdef HAVE_LIBBFD_SUPPORT
> #define PACKAGE 'perf'
> #include <bfd.h>
> @@ -1871,6 +1916,7 @@ int dso__load(struct dso *dso, struct map *map)
> bool next_slot = false;
> bool is_reg;
> bool nsexit;
> + int llvmrc = -1;
> int bfdrc = -1;
> int sirc = -1;
>
> @@ -1899,11 +1945,15 @@ int dso__load(struct dso *dso, struct map *map)
> }
> }
>
> -#ifdef HAVE_LIBBFD_SUPPORT
> +#ifdef HAVE_LIBLLVM_SUPPORT
> if (is_reg)
> + llvmrc = dso__load_llvm_symbols(dso, name);
> +#endif
> +#ifdef HAVE_LIBBFD_SUPPORT
> + if (is_reg && llvmrc < 0)
> bfdrc = dso__load_bfd_symbols(dso, name);
> #endif
> - if (is_reg && bfdrc < 0)
> + if (is_reg && llvmrc < 0 && bfdrc < 0)
> sirc = symsrc__init(ss, dso, name, symtab_type);
>
> if (nsexit)
> diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
> index 3fb5d146d9b1..01256c41467a 100644
> --- a/tools/perf/util/symbol.h
> +++ b/tools/perf/util/symbol.h
> @@ -174,6 +174,9 @@ int symbol__config_symfs(const struct option *opt __maybe_unused,
>
> struct symsrc;
>
> +#ifdef HAVE_LIBLLVM_SUPPORT
> +int dso__load_llvm_symbols(struct dso *dso, const char *debugfile);
> +#endif
> #ifdef HAVE_LIBBFD_SUPPORT
> int dso__load_bfd_symbols(struct dso *dso, const char *debugfile);
> #endif
> --
> 2.45.1
>
>