github.com/kidsbmilk/gofrontend_all@v0.0.0-20220701224323-6479d5976c5d/go/go-encode-id.cc (about) 1 // go-encode-id.cc -- Go identifier and packagepath encoding/decoding hooks 2 3 // Copyright 2016 The Go Authors. All rights reserved. 4 // Use of this source code is governed by a BSD-style 5 // license that can be found in the LICENSE file. 6 7 #include "go-system.h" 8 9 #include "gogo.h" 10 #include "go-location.h" 11 #include "go-linemap.h" 12 #include "go-encode-id.h" 13 #include "lex.h" 14 15 // Return whether the character c can appear in a name that we are 16 // encoding. We only permit ASCII alphanumeric characters. 17 18 static bool 19 char_needs_encoding(char c) 20 { 21 switch (c) 22 { 23 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 24 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 25 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 26 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 27 case 'Y': case 'Z': 28 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 29 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 30 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 31 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 32 case 'y': case 'z': 33 case '0': case '1': case '2': case '3': case '4': 34 case '5': case '6': case '7': case '8': case '9': 35 return false; 36 default: 37 return true; 38 } 39 } 40 41 // Return whether the identifier needs to be translated because it 42 // contains non-ASCII characters. 43 44 bool 45 go_id_needs_encoding(const std::string& str) 46 { 47 for (std::string::const_iterator p = str.begin(); 48 p != str.end(); 49 ++p) 50 if (char_needs_encoding(*p)) 51 return true; 52 return false; 53 } 54 55 // Map from characters to the underscore encoding for them. 56 57 class Special_char_code 58 { 59 public: 60 Special_char_code(); 61 62 // Return the simple underscore encoding for C, or 0 if none. 63 char 64 code_for(unsigned int c) const 65 { 66 if (c <= 127) 67 return this->codes_[c]; 68 return 0; 69 } 70 71 private: 72 // Encodings for characters. 73 char codes_[128]; 74 }; 75 76 // Construct the underscore encoding map. 77 78 Special_char_code::Special_char_code() 79 { 80 memset(this->codes_, 0, sizeof this->codes_); 81 this->codes_['_'] = '_'; 82 this->codes_['.'] = '0'; 83 this->codes_['/'] = '1'; 84 this->codes_['*'] = '2'; 85 this->codes_[','] = '3'; 86 this->codes_['{'] = '4'; 87 this->codes_['}'] = '5'; 88 this->codes_['['] = '6'; 89 this->codes_[']'] = '7'; 90 this->codes_['('] = '8'; 91 this->codes_[')'] = '9'; 92 this->codes_['"'] = 'a'; 93 this->codes_[' '] = 'b'; 94 this->codes_[';'] = 'c'; 95 } 96 97 // The singleton Special_char_code. 98 99 static const Special_char_code special_char_code; 100 101 // Pull the next UTF-8 character out of P and store it in *PC. Return 102 // the number of bytes read. 103 104 static size_t 105 fetch_utf8_char(const char* p, unsigned int* pc) 106 { 107 unsigned char c = *p; 108 if ((c & 0x80) == 0) 109 { 110 *pc = c; 111 return 1; 112 } 113 size_t len = 0; 114 while ((c & 0x80) != 0) 115 { 116 ++len; 117 c <<= 1; 118 } 119 unsigned int rc = *p & ((1 << (7 - len)) - 1); 120 for (size_t i = 1; i < len; i++) 121 { 122 unsigned int u = p[i]; 123 rc <<= 6; 124 rc |= u & 0x3f; 125 } 126 *pc = rc; 127 return len; 128 } 129 130 // Encode an identifier using assembler-friendly characters. The 131 // encoding is described in detail near the end of the long comment at 132 // the start of names.cc. 133 134 std::string 135 go_encode_id(const std::string &id) 136 { 137 if (Lex::is_invalid_identifier(id)) 138 { 139 go_assert(saw_errors()); 140 return id; 141 } 142 143 std::string ret; 144 const char* p = id.c_str(); 145 const char* pend = p + id.length(); 146 147 // We encode a leading digit, to ensure that no identifier starts 148 // with a digit. 149 if (pend > p && p[0] >= '0' && p[0] <= '9') 150 { 151 char buf[8]; 152 snprintf(buf, sizeof buf, "_x%02x", p[0]); 153 ret.append(buf); 154 ++p; 155 } 156 157 while (p < pend) 158 { 159 unsigned int c; 160 size_t len = fetch_utf8_char(p, &c); 161 if (len == 1) 162 { 163 if (!char_needs_encoding(c)) 164 ret.push_back(c); 165 else 166 { 167 char code = special_char_code.code_for(c); 168 if (code != 0) 169 { 170 ret.push_back('_'); 171 ret.push_back(code); 172 } 173 else 174 { 175 char buf[8]; 176 snprintf(buf, sizeof buf, "_x%02x", c); 177 ret.append(buf); 178 } 179 } 180 } 181 else 182 { 183 char buf[16]; 184 if (c < 0x10000) 185 snprintf(buf, sizeof buf, "_u%04x", c); 186 else 187 snprintf(buf, sizeof buf, "_U%08x", c); 188 ret.append(buf); 189 } 190 191 p += len; 192 } 193 194 return ret; 195 } 196 197 // Convert a hex digit string to a unicode codepoint. No checking 198 // to insure that the hex digit is meaningful. 199 200 static unsigned 201 hex_digits_to_unicode_codepoint(const char *digits, unsigned ndig) 202 { 203 unsigned result = 0; 204 for (unsigned i = 0; i < ndig; ++i) { 205 result <<= 4; 206 result |= Lex::hex_val(digits[i]); 207 } 208 return result; 209 } 210 211 // Decode/demangle a mangled string produced by go_encode_id(). Returns 212 // empty string if demangling process fails in some way. At the moment 213 // this routine is unused; there is an equivalent routine in the runtime 214 // used for demangling symbols appearing in stack traces. 215 216 std::string 217 go_decode_id(const std::string &encoded) 218 { 219 std::string ret; 220 const char* p = encoded.c_str(); 221 const char* pend = p + encoded.length(); 222 const Location loc = Linemap::predeclared_location(); 223 224 while (p < pend) 225 { 226 if (*p != '_' || p + 1 == pend) 227 { 228 ret.push_back(*p); 229 p++; 230 continue; 231 } 232 233 switch (p[1]) 234 { 235 case '_': 236 ret.push_back('_'); 237 p += 2; 238 break; 239 case '0': 240 ret.push_back('.'); 241 p += 2; 242 break; 243 case '1': 244 ret.push_back('/'); 245 p += 2; 246 break; 247 case '2': 248 ret.push_back('*'); 249 p += 2; 250 break; 251 case '3': 252 ret.push_back(','); 253 p += 2; 254 break; 255 case '4': 256 ret.push_back('{'); 257 p += 2; 258 break; 259 case '5': 260 ret.push_back('}'); 261 p += 2; 262 break; 263 case '6': 264 ret.push_back('['); 265 p += 2; 266 break; 267 case '7': 268 ret.push_back(']'); 269 p += 2; 270 break; 271 case '8': 272 ret.push_back('('); 273 p += 2; 274 break; 275 case '9': 276 ret.push_back(')'); 277 p += 2; 278 break; 279 case 'a': 280 ret.push_back('"'); 281 p += 2; 282 break; 283 case 'b': 284 ret.push_back(' '); 285 p += 2; 286 break; 287 case 'c': 288 ret.push_back(';'); 289 p += 2; 290 break; 291 case 'x': 292 { 293 const char* digits = p + 2; 294 if (strlen(digits) < 2) 295 return ""; 296 unsigned int rune = hex_digits_to_unicode_codepoint(digits, 2); 297 Lex::append_char(rune, true, &ret, loc); 298 p += 4; 299 } 300 break; 301 case 'u': 302 { 303 const char* digits = p + 2; 304 if (strlen(digits) < 4) 305 return ""; 306 unsigned int rune = hex_digits_to_unicode_codepoint(digits, 4); 307 Lex::append_char(rune, true, &ret, loc); 308 p += 6; 309 } 310 break; 311 case 'U': 312 { 313 const char* digits = p + 2; 314 if (strlen(digits) < 8) 315 return ""; 316 unsigned int rune = hex_digits_to_unicode_codepoint(digits, 8); 317 Lex::append_char(rune, true, &ret, loc); 318 p += 10; 319 } 320 break; 321 default: 322 return ""; 323 } 324 } 325 326 return ret; 327 } 328 329 // Encode a struct field tag. This is only used when we need to 330 // create a type descriptor for an anonymous struct type with field 331 // tags. Underscore encoding will be applied to the returned string. 332 // The tag will appear between curly braces, so that is all we have to 333 // avoid. 334 335 std::string 336 go_mangle_struct_tag(const std::string& tag) 337 { 338 std::string ret; 339 const char* p = tag.c_str(); 340 const char* pend = p + tag.length(); 341 while (p < pend) 342 { 343 unsigned int c; 344 size_t len = fetch_utf8_char(p, &c); 345 if (len > 1) 346 ret.append(p, len); 347 else if (c != '{' && c != '}' && c != '\\') 348 ret.push_back(c); 349 else 350 { 351 ret.push_back('\\'); 352 ret.push_back(c); 353 } 354 p += len; 355 } 356 return ret; 357 }