Linux Perf
demangle-rust.c
Go to the documentation of this file.
1 // SPDX-License-Identifier: GPL-2.0
2 #include <string.h>
3 #include "util.h"
4 #include "debug.h"
5 
6 #include "demangle-rust.h"
7 
8 /*
9  * Mangled Rust symbols look like this:
10  *
11  * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
12  *
13  * The original symbol is:
14  *
15  * <std::sys::fd::FileDesc as core::ops::Drop>::drop
16  *
17  * The last component of the path is a 64-bit hash in lowercase hex, prefixed
18  * with "h". Rust does not have a global namespace between crates, an illusion
19  * which Rust maintains by using the hash to distinguish things that would
20  * otherwise have the same symbol.
21  *
22  * Any path component not starting with a XID_Start character is prefixed with
23  * "_".
24  *
25  * The following escape sequences are used:
26  *
27  * "," => $C$
28  * "@" => $SP$
29  * "*" => $BP$
30  * "&" => $RF$
31  * "<" => $LT$
32  * ">" => $GT$
33  * "(" => $LP$
34  * ")" => $RP$
35  * " " => $u20$
36  * "'" => $u27$
37  * "[" => $u5b$
38  * "]" => $u5d$
39  * "~" => $u7e$
40  *
41  * A double ".." means "::" and a single "." means "-".
42  *
43  * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
44  */
45 
46 static const char *hash_prefix = "::h";
47 static const size_t hash_prefix_len = 3;
48 static const size_t hash_len = 16;
49 
50 static bool is_prefixed_hash(const char *start);
51 static bool looks_like_rust(const char *sym, size_t len);
52 static bool unescape(const char **in, char **out, const char *seq, char value);
53 
54 /*
55  * INPUT:
56  * sym: symbol that has been through BFD-demangling
57  *
58  * This function looks for the following indicators:
59  *
60  * 1. The hash must consist of "h" followed by 16 lowercase hex digits.
61  *
62  * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
63  * hex digits. This is true of 99.9998% of hashes so once in your life you
64  * may see a false negative. The point is to notice path components that
65  * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
66  * this case a false positive (non-Rust symbol has an important path
67  * component removed because it looks like a Rust hash) is worse than a
68  * false negative (the rare Rust symbol is not demangled) so this sets the
69  * balance in favor of false negatives.
70  *
71  * 3. There must be no characters other than a-zA-Z0-9 and _.:$
72  *
73  * 4. There must be no unrecognized $-sign sequences.
74  *
75  * 5. There must be no sequence of three or more dots in a row ("...").
76  */
77 bool
78 rust_is_mangled(const char *sym)
79 {
80  size_t len, len_without_hash;
81 
82  if (!sym)
83  return false;
84 
85  len = strlen(sym);
86  if (len <= hash_prefix_len + hash_len)
87  /* Not long enough to contain "::h" + hash + something else */
88  return false;
89 
90  len_without_hash = len - (hash_prefix_len + hash_len);
91  if (!is_prefixed_hash(sym + len_without_hash))
92  return false;
93 
94  return looks_like_rust(sym, len_without_hash);
95 }
96 
97 /*
98  * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
99  * digits must comprise between 5 and 15 (inclusive) distinct digits.
100  */
101 static bool is_prefixed_hash(const char *str)
102 {
103  const char *end;
104  bool seen[16];
105  size_t i;
106  int count;
107 
108  if (strncmp(str, hash_prefix, hash_prefix_len))
109  return false;
110  str += hash_prefix_len;
111 
112  memset(seen, false, sizeof(seen));
113  for (end = str + hash_len; str < end; str++)
114  if (*str >= '0' && *str <= '9')
115  seen[*str - '0'] = true;
116  else if (*str >= 'a' && *str <= 'f')
117  seen[*str - 'a' + 10] = true;
118  else
119  return false;
120 
121  /* Count how many distinct digits seen */
122  count = 0;
123  for (i = 0; i < 16; i++)
124  if (seen[i])
125  count++;
126 
127  return count >= 5 && count <= 15;
128 }
129 
130 static bool looks_like_rust(const char *str, size_t len)
131 {
132  const char *end = str + len;
133 
134  while (str < end)
135  switch (*str) {
136  case '$':
137  if (!strncmp(str, "$C$", 3))
138  str += 3;
139  else if (!strncmp(str, "$SP$", 4)
140  || !strncmp(str, "$BP$", 4)
141  || !strncmp(str, "$RF$", 4)
142  || !strncmp(str, "$LT$", 4)
143  || !strncmp(str, "$GT$", 4)
144  || !strncmp(str, "$LP$", 4)
145  || !strncmp(str, "$RP$", 4))
146  str += 4;
147  else if (!strncmp(str, "$u20$", 5)
148  || !strncmp(str, "$u27$", 5)
149  || !strncmp(str, "$u5b$", 5)
150  || !strncmp(str, "$u5d$", 5)
151  || !strncmp(str, "$u7e$", 5))
152  str += 5;
153  else
154  return false;
155  break;
156  case '.':
157  /* Do not allow three or more consecutive dots */
158  if (!strncmp(str, "...", 3))
159  return false;
160  /* Fall through */
161  case 'a' ... 'z':
162  case 'A' ... 'Z':
163  case '0' ... '9':
164  case '_':
165  case ':':
166  str++;
167  break;
168  default:
169  return false;
170  }
171 
172  return true;
173 }
174 
175 /*
176  * INPUT:
177  * sym: symbol for which rust_is_mangled(sym) returns true
178  *
179  * The input is demangled in-place because the mangled name is always longer
180  * than the demangled one.
181  */
182 void
184 {
185  const char *in;
186  char *out;
187  const char *end;
188 
189  if (!sym)
190  return;
191 
192  in = sym;
193  out = sym;
194  end = sym + strlen(sym) - (hash_prefix_len + hash_len);
195 
196  while (in < end)
197  switch (*in) {
198  case '$':
199  if (!(unescape(&in, &out, "$C$", ',')
200  || unescape(&in, &out, "$SP$", '@')
201  || unescape(&in, &out, "$BP$", '*')
202  || unescape(&in, &out, "$RF$", '&')
203  || unescape(&in, &out, "$LT$", '<')
204  || unescape(&in, &out, "$GT$", '>')
205  || unescape(&in, &out, "$LP$", '(')
206  || unescape(&in, &out, "$RP$", ')')
207  || unescape(&in, &out, "$u20$", ' ')
208  || unescape(&in, &out, "$u27$", '\'')
209  || unescape(&in, &out, "$u5b$", '[')
210  || unescape(&in, &out, "$u5d$", ']')
211  || unescape(&in, &out, "$u7e$", '~'))) {
212  pr_err("demangle-rust: unexpected escape sequence");
213  goto done;
214  }
215  break;
216  case '_':
217  /*
218  * If this is the start of a path component and the next
219  * character is an escape sequence, ignore the
220  * underscore. The mangler inserts an underscore to make
221  * sure the path component begins with a XID_Start
222  * character.
223  */
224  if ((in == sym || in[-1] == ':') && in[1] == '$')
225  in++;
226  else
227  *out++ = *in++;
228  break;
229  case '.':
230  if (in[1] == '.') {
231  /* ".." becomes "::" */
232  *out++ = ':';
233  *out++ = ':';
234  in += 2;
235  } else {
236  /* "." becomes "-" */
237  *out++ = '-';
238  in++;
239  }
240  break;
241  case 'a' ... 'z':
242  case 'A' ... 'Z':
243  case '0' ... '9':
244  case ':':
245  *out++ = *in++;
246  break;
247  default:
248  pr_err("demangle-rust: unexpected character '%c' in symbol\n",
249  *in);
250  goto done;
251  }
252 
253 done:
254  *out = '\0';
255 }
256 
257 static bool unescape(const char **in, char **out, const char *seq, char value)
258 {
259  size_t len = strlen(seq);
260 
261  if (strncmp(*in, seq, len))
262  return false;
263 
264  **out = value;
265 
266  *in += len;
267  *out += 1;
268 
269  return true;
270 }
static const size_t hash_len
Definition: demangle-rust.c:48
int value
Definition: python.c:1143
void rust_demangle_sym(char *sym)
static bool is_prefixed_hash(const char *start)
#define pr_err(fmt,...)
Definition: json.h:21
x86 movsq based memset() in arch/x86/lib/memset_64.S") MEMSET_FN(memset_erms
static const char * hash_prefix
Definition: demangle-rust.c:46
static int str(yyscan_t scanner, int token)
static bool unescape(const char **in, char **out, const char *seq, char value)
bool rust_is_mangled(const char *sym)
Definition: demangle-rust.c:78
static bool looks_like_rust(const char *sym, size_t len)
u64 start
Definition: hists_common.c:25
static int sym(yyscan_t scanner, int type, int config)
static const size_t hash_prefix_len
Definition: demangle-rust.c:47
static bool done
Definition: futex-hash.c:35