[PATCH] perf tool: add Rust demangling

From: David Tolnay
Date: Sat Jul 09 2016 - 04:38:49 EST


From: David Tolnay <dtolnay@xxxxxxxxx>
Date: Sat, 9 Jul 2016 00:20:00 -0700
Subject: [PATCH] perf tool: add Rust demangling

Rust demangling is another step after bfd demangling. Add a diagnosis to
identify mangled Rust symbols based on the hash that the Rust mangler
appends as the last path component, as well as other characteristics.
Add a demangler to reconstruct the original symbol.

Signed-off-by: David Tolnay <dtolnay@xxxxxxxxx>
---
tools/perf/util/Build | 1 +
tools/perf/util/demangle-rust.c | 269 ++++++++++++++++++++++++++++++++++++++++
tools/perf/util/demangle-rust.h | 7 ++
tools/perf/util/symbol-elf.c | 8 ++
4 files changed, 285 insertions(+)
create mode 100644 tools/perf/util/demangle-rust.c
create mode 100644 tools/perf/util/demangle-rust.h

diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 8c6c8a0..16b59b5 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -108,6 +108,7 @@ libperf-y += scripting-engines/
libperf-$(CONFIG_ZLIB) += zlib.o
libperf-$(CONFIG_LZMA) += lzma.o
libperf-y += demangle-java.o
+libperf-y += demangle-rust.o

ifdef CONFIG_JITDUMP
libperf-$(CONFIG_LIBELF) += jitdump.o
diff --git a/tools/perf/util/demangle-rust.c b/tools/perf/util/demangle-rust.c
new file mode 100644
index 0000000..f9dafa8
--- /dev/null
+++ b/tools/perf/util/demangle-rust.c
@@ -0,0 +1,269 @@
+#include <string.h>
+#include "util.h"
+#include "debug.h"
+
+#include "demangle-rust.h"
+
+/*
+ * Mangled Rust symbols look like this:
+ *
+ * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
+ *
+ * The original symbol is:
+ *
+ * <std::sys::fd::FileDesc as core::ops::Drop>::drop
+ *
+ * The last component of the path is a 64-bit hash in lowercase hex, prefixed
+ * with "h". Rust does not have a global namespace between crates, an illusion
+ * which Rust maintains by using the hash to distinguish things that would
+ * otherwise have the same symbol.
+ *
+ * Any path component not starting with a XID_Start character is prefixed with
+ * "_".
+ *
+ * The following escape sequences are used:
+ *
+ * "," => $C$
+ * "@" => $SP$
+ * "*" => $BP$
+ * "&" => $RF$
+ * "<" => $LT$
+ * ">" => $GT$
+ * "(" => $LP$
+ * ")" => $RP$
+ * " " => $u20$
+ * "'" => $u27$
+ * "[" => $u5b$
+ * "]" => $u5d$
+ * "~" => $u7e$
+ *
+ * A double ".." means "::" and a single "." means "-".
+ *
+ * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
+ */
+
+static const char *hash_prefix = "::h";
+static const size_t hash_prefix_len = 3;
+static const size_t hash_len = 16;
+
+static bool is_prefixed_hash(const char *start);
+static bool looks_like_rust(const char *sym, size_t len);
+static bool unescape(const char **in, char **out, const char *seq, char value);
+
+/*
+ * INPUT:
+ * sym: symbol that has been through BFD-demangling
+ *
+ * This function looks for the following indicators:
+ *
+ * 1. The hash must consist of "h" followed by 16 lowercase hex digits.
+ *
+ * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
+ * hex digits. This is true of 99.9998% of hashes so once in your life you
+ * may see a false negative. The point is to notice path components that
+ * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
+ * this case a false positive (non-Rust symbol has an important path
+ * component removed because it looks like a Rust hash) is worse than a
+ * false negative (the rare Rust symbol is not demangled) so this sets the
+ * balance in favor of false negatives.
+ *
+ * 3. There must be no characters other than a-zA-Z0-9 and _.:$
+ *
+ * 4. There must be no unrecognized $-sign sequences.
+ *
+ * 5. There must be no sequence of three or more dots in a row ("...").
+ */
+bool
+rust_is_mangled(const char *sym)
+{
+ size_t len, len_without_hash;
+
+ if (!sym)
+ return false;
+
+ len = strlen(sym);
+ if (len <= hash_prefix_len + hash_len)
+ /* Not long enough to contain "::h" + hash + something else */
+ return false;
+
+ len_without_hash = len - (hash_prefix_len + hash_len);
+ if (!is_prefixed_hash(sym + len_without_hash))
+ return false;
+
+ return looks_like_rust(sym, len_without_hash);
+}
+
+/*
+ * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
+ * digits must comprise between 5 and 15 (inclusive) distinct digits.
+ */
+static bool is_prefixed_hash(const char *str)
+{
+ const char *end;
+ bool seen[16];
+ size_t i;
+ int count;
+
+ if (strncmp(str, hash_prefix, hash_prefix_len))
+ return false;
+ str += hash_prefix_len;
+
+ memset(seen, false, sizeof(seen));
+ for (end = str + hash_len; str < end; str++)
+ if (*str >= '0' && *str <= '9')
+ seen[*str - '0'] = true;
+ else if (*str >= 'a' && *str <= 'f')
+ seen[*str - 'a' + 10] = true;
+ else
+ return false;
+
+ /* Count how many distinct digits seen */
+ count = 0;
+ for (i = 0; i < 16; i++)
+ if (seen[i])
+ count++;
+
+ return count >= 5 && count <= 15;
+}
+
+static bool looks_like_rust(const char *str, size_t len)
+{
+ const char *end = str + len;
+
+ while (str < end)
+ switch (*str) {
+ case '$':
+ if (!strncmp(str, "$C$", 3))
+ str += 3;
+ else if (!strncmp(str, "$SP$", 4)
+ || !strncmp(str, "$BP$", 4)
+ || !strncmp(str, "$RF$", 4)
+ || !strncmp(str, "$LT$", 4)
+ || !strncmp(str, "$GT$", 4)
+ || !strncmp(str, "$LP$", 4)
+ || !strncmp(str, "$RP$", 4))
+ str += 4;
+ else if (!strncmp(str, "$u20$", 5)
+ || !strncmp(str, "$u27$", 5)
+ || !strncmp(str, "$u5b$", 5)
+ || !strncmp(str, "$u5d$", 5)
+ || !strncmp(str, "$u7e$", 5))
+ str += 5;
+ else
+ return false;
+ break;
+ case '.':
+ /* Do not allow three or more consecutive dots */
+ if (!strncmp(str, "...", 3))
+ return false;
+ /* Fall through */
+ case 'a' ... 'z':
+ case 'A' ... 'Z':
+ case '0' ... '9':
+ case '_':
+ case ':':
+ str++;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * INPUT:
+ * sym: symbol for which rust_is_mangled(sym) returns true
+ *
+ * The input is demangled in-place because the mangled name is always longer
+ * than the demangled one.
+ */
+void
+rust_demangle_sym(char *sym)
+{
+ const char *in;
+ char *out;
+ const char *end;
+
+ if (!sym)
+ return;
+
+ in = sym;
+ out = sym;
+ end = sym + strlen(sym) - (hash_prefix_len + hash_len);
+
+ while (in < end)
+ switch (*in) {
+ case '$':
+ if (!(unescape(&in, &out, "$C$", ',')
+ || unescape(&in, &out, "$SP$", '@')
+ || unescape(&in, &out, "$BP$", '*')
+ || unescape(&in, &out, "$RF$", '&')
+ || unescape(&in, &out, "$LT$", '<')
+ || unescape(&in, &out, "$GT$", '>')
+ || unescape(&in, &out, "$LP$", '(')
+ || unescape(&in, &out, "$RP$", ')')
+ || unescape(&in, &out, "$u20$", ' ')
+ || unescape(&in, &out, "$u27$", '\'')
+ || unescape(&in, &out, "$u5b$", '[')
+ || unescape(&in, &out, "$u5d$", ']')
+ || unescape(&in, &out, "$u7e$", '~'))) {
+ pr_err("demangle-rust: unexpected escape sequence");
+ goto done;
+ }
+ break;
+ case '_':
+ /*
+ * If this is the start of a path component and the next
+ * character is an escape sequence, ignore the
+ * underscore. The mangler inserts an underscore to make
+ * sure the path component begins with a XID_Start
+ * character.
+ */
+ if ((in == sym || in[-1] == ':') && in[1] == '$')
+ in++;
+ else
+ *out++ = *in++;
+ break;
+ case '.':
+ if (in[1] == '.') {
+ /* ".." becomes "::" */
+ *out++ = ':';
+ *out++ = ':';
+ in += 2;
+ } else {
+ /* "." becomes "-" */
+ *out++ = '-';
+ in++;
+ }
+ break;
+ case 'a' ... 'z':
+ case 'A' ... 'Z':
+ case '0' ... '9':
+ case ':':
+ *out++ = *in++;
+ break;
+ default:
+ pr_err("demangle-rust: unexpected character '%c' in symbol\n",
+ *in);
+ goto done;
+ }
+
+done:
+ *out = '\0';
+}
+
+static bool unescape(const char **in, char **out, const char *seq, char value)
+{
+ size_t len = strlen(seq);
+
+ if (strncmp(*in, seq, len))
+ return false;
+
+ **out = value;
+
+ *in += len;
+ *out += 1;
+
+ return true;
+}
diff --git a/tools/perf/util/demangle-rust.h b/tools/perf/util/demangle-rust.h
new file mode 100644
index 0000000..7b41ead
--- /dev/null
+++ b/tools/perf/util/demangle-rust.h
@@ -0,0 +1,7 @@
+#ifndef __PERF_DEMANGLE_RUST
+#define __PERF_DEMANGLE_RUST 1
+
+bool rust_is_mangled(const char *str);
+void rust_demangle_sym(char *str);
+
+#endif /* __PERF_DEMANGLE_RUST */
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 87a297d..5f345d6 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -7,6 +7,7 @@

#include "symbol.h"
#include "demangle-java.h"
+#include "demangle-rust.h"
#include "machine.h"
#include "vdso.h"
#include <symbol/kallsyms.h>
@@ -1072,6 +1073,13 @@ new_symbol:
demangled = bfd_demangle(NULL, elf_name, demangle_flags);
if (demangled == NULL)
demangled = java_demangle_sym(elf_name, JAVA_DEMANGLE_NORET);
+ else if (rust_is_mangled(demangled))
+ /*
+ * Input to Rust demangling is the BFD-demangled
+ * name which it Rust-demangles in place.
+ */
+ rust_demangle_sym(demangled);
+
if (demangled != NULL)
elf_name = demangled;
}
--
2.9.0