github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Parser/Lexer.cpp (about) 1 //===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===// 2 // 3 // Copyright 2019 The MLIR Authors. 4 // 5 // Licensed under the Apache License, Version 2.0 (the "License"); 6 // you may not use this file except in compliance with the License. 7 // You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // ============================================================================= 17 // 18 // This file implements the lexer for the MLIR textual form. 19 // 20 //===----------------------------------------------------------------------===// 21 22 #include "Lexer.h" 23 #include "mlir/IR/Diagnostics.h" 24 #include "mlir/IR/Identifier.h" 25 #include "mlir/IR/Location.h" 26 #include "mlir/IR/MLIRContext.h" 27 #include "llvm/Support/SourceMgr.h" 28 using namespace mlir; 29 30 using llvm::SMLoc; 31 using llvm::SourceMgr; 32 33 // Returns true if 'c' is an allowable puncuation character: [$._-] 34 // Returns false otherwise. 35 static bool isPunct(char c) { 36 return c == '$' || c == '.' || c == '_' || c == '-'; 37 } 38 39 Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context) 40 : sourceMgr(sourceMgr), context(context) { 41 auto bufferID = sourceMgr.getMainFileID(); 42 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer(); 43 curPtr = curBuffer.begin(); 44 } 45 46 /// Encode the specified source location information into an attribute for 47 /// attachment to the IR. 48 Location Lexer::getEncodedSourceLocation(llvm::SMLoc loc) { 49 auto &sourceMgr = getSourceMgr(); 50 unsigned mainFileID = sourceMgr.getMainFileID(); 51 auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID); 52 auto *buffer = sourceMgr.getMemoryBuffer(mainFileID); 53 54 return FileLineColLoc::get(buffer->getBufferIdentifier(), lineAndColumn.first, 55 lineAndColumn.second, context); 56 } 57 58 /// emitError - Emit an error message and return an Token::error token. 59 Token Lexer::emitError(const char *loc, const Twine &message) { 60 mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc)), 61 message); 62 return formToken(Token::error, loc); 63 } 64 65 Token Lexer::lexToken() { 66 // Ignore whitespace. 67 while (true) { 68 switch (*curPtr) { 69 case ' ': 70 case '\t': 71 case '\n': 72 case '\r': 73 ++curPtr; 74 continue; 75 default: 76 // Terminate loop on non-whitespace, including either an embedded or 77 // final terminating nul character that llvm::MemoryBuffer guarantees 78 // will be there. 79 break; 80 } 81 break; 82 } 83 84 const char *tokStart = curPtr; 85 switch (*curPtr++) { 86 default: 87 // Handle bare identifiers. 88 if (isalpha(curPtr[-1])) 89 return lexBareIdentifierOrKeyword(tokStart); 90 91 // Unknown character, emit an error. 92 return emitError(tokStart, "unexpected character"); 93 94 case '_': 95 // Handle bare identifiers. 96 return lexBareIdentifierOrKeyword(tokStart); 97 98 case 0: 99 // This may either be a nul character in the source file or may be the EOF 100 // marker that llvm::MemoryBuffer guarantees will be there. 101 if (curPtr - 1 == curBuffer.end()) 102 return formToken(Token::eof, tokStart); 103 104 LLVM_FALLTHROUGH; 105 case ':': 106 return formToken(Token::colon, tokStart); 107 case ',': 108 return formToken(Token::comma, tokStart); 109 case '.': 110 return lexEllipsis(tokStart); 111 case '(': 112 return formToken(Token::l_paren, tokStart); 113 case ')': 114 return formToken(Token::r_paren, tokStart); 115 case '{': 116 return formToken(Token::l_brace, tokStart); 117 case '}': 118 return formToken(Token::r_brace, tokStart); 119 case '[': 120 return formToken(Token::l_square, tokStart); 121 case ']': 122 return formToken(Token::r_square, tokStart); 123 case '<': 124 return formToken(Token::less, tokStart); 125 case '>': 126 return formToken(Token::greater, tokStart); 127 case '=': 128 return formToken(Token::equal, tokStart); 129 130 case '+': 131 return formToken(Token::plus, tokStart); 132 case '*': 133 return formToken(Token::star, tokStart); 134 case '-': 135 if (*curPtr == '>') { 136 ++curPtr; 137 return formToken(Token::arrow, tokStart); 138 } 139 return formToken(Token::minus, tokStart); 140 141 case '?': 142 return formToken(Token::question, tokStart); 143 144 case '/': 145 if (*curPtr == '/') 146 return lexComment(); 147 return emitError(tokStart, "unexpected character"); 148 149 case '@': 150 return lexAtIdentifier(tokStart); 151 152 case '!': 153 LLVM_FALLTHROUGH; 154 case '^': 155 LLVM_FALLTHROUGH; 156 case '#': 157 LLVM_FALLTHROUGH; 158 case '%': 159 return lexPrefixedIdentifier(tokStart); 160 case '"': 161 return lexString(tokStart); 162 163 case '0': 164 case '1': 165 case '2': 166 case '3': 167 case '4': 168 case '5': 169 case '6': 170 case '7': 171 case '8': 172 case '9': 173 return lexNumber(tokStart); 174 } 175 } 176 177 /// Lex an '@foo' identifier. 178 /// 179 /// symbol-ref-id ::= `@` bare-id 180 /// 181 Token Lexer::lexAtIdentifier(const char *tokStart) { 182 // These always start with a letter or underscore. 183 auto cur = *curPtr++; 184 if (!isalpha(cur) && cur != '_') 185 return emitError(curPtr - 1, 186 "@ identifier expected to start with letter or '_'"); 187 188 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' || 189 *curPtr == '$' || *curPtr == '.') 190 ++curPtr; 191 return formToken(Token::at_identifier, tokStart); 192 } 193 194 /// Lex a bare identifier or keyword that starts with a letter. 195 /// 196 /// bare-id ::= (letter|[_]) (letter|digit|[_$.])* 197 /// integer-type ::= `i[1-9][0-9]*` 198 /// 199 Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) { 200 // Match the rest of the identifier regex: [0-9a-zA-Z_.$]* 201 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' || 202 *curPtr == '$' || *curPtr == '.') 203 ++curPtr; 204 205 // Check to see if this identifier is a keyword. 206 StringRef spelling(tokStart, curPtr - tokStart); 207 208 // Check for i123. 209 if (tokStart[0] == 'i') { 210 bool allDigits = true; 211 for (auto c : spelling.drop_front()) 212 allDigits &= isdigit(c) != 0; 213 if (allDigits && spelling.size() != 1) 214 return Token(Token::inttype, spelling); 215 } 216 217 Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling) 218 #define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING) 219 #include "TokenKinds.def" 220 .Default(Token::bare_identifier); 221 222 return Token(kind, spelling); 223 } 224 225 /// Lex a comment line, starting with a semicolon. 226 /// 227 /// TODO: add a regex for comments here and to the spec. 228 /// 229 Token Lexer::lexComment() { 230 // Advance over the second '/' in a '//' comment. 231 assert(*curPtr == '/'); 232 ++curPtr; 233 234 while (true) { 235 switch (*curPtr++) { 236 case '\n': 237 case '\r': 238 // Newline is end of comment. 239 return lexToken(); 240 case 0: 241 // If this is the end of the buffer, end the comment. 242 if (curPtr - 1 == curBuffer.end()) { 243 --curPtr; 244 return lexToken(); 245 } 246 LLVM_FALLTHROUGH; 247 default: 248 // Skip over other characters. 249 break; 250 } 251 } 252 } 253 254 /// Lex an ellipsis. 255 /// 256 /// ellipsis ::= '...' 257 /// 258 Token Lexer::lexEllipsis(const char *tokStart) { 259 assert(curPtr[-1] == '.'); 260 261 if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.') 262 return emitError(curPtr, "expected three consecutive dots for an ellipsis"); 263 264 curPtr += 2; 265 return formToken(Token::ellipsis, tokStart); 266 } 267 268 /// Lex a number literal. 269 /// 270 /// integer-literal ::= digit+ | `0x` hex_digit+ 271 /// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)? 272 /// 273 Token Lexer::lexNumber(const char *tokStart) { 274 assert(isdigit(curPtr[-1])); 275 276 // Handle the hexadecimal case. 277 if (curPtr[-1] == '0' && *curPtr == 'x') { 278 // If we see stuff like 0xi32, this is a literal `0` follwed by an 279 // identifier `xi32`, stop after `0`. 280 if (!isxdigit(curPtr[1])) 281 return formToken(Token::integer, tokStart); 282 283 curPtr += 2; 284 while (isxdigit(*curPtr)) 285 ++curPtr; 286 287 return formToken(Token::integer, tokStart); 288 } 289 290 // Handle the normal decimal case. 291 while (isdigit(*curPtr)) 292 ++curPtr; 293 294 if (*curPtr != '.') 295 return formToken(Token::integer, tokStart); 296 ++curPtr; 297 298 // Skip over [0-9]*([eE][-+]?[0-9]+)? 299 while (isdigit(*curPtr)) 300 ++curPtr; 301 302 if (*curPtr == 'e' || *curPtr == 'E') { 303 if (isdigit(static_cast<unsigned char>(curPtr[1])) || 304 ((curPtr[1] == '-' || curPtr[1] == '+') && 305 isdigit(static_cast<unsigned char>(curPtr[2])))) { 306 curPtr += 2; 307 while (isdigit(*curPtr)) 308 ++curPtr; 309 } 310 } 311 return formToken(Token::floatliteral, tokStart); 312 } 313 314 /// Lex an identifier that starts with a prefix followed by suffix-id. 315 /// 316 /// affine-map-id ::= `#` suffix-id 317 /// ssa-id ::= '%' suffix-id 318 /// block-id ::= '^' suffix-id 319 /// type-id ::= '!' suffix-id 320 /// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)* 321 /// id-punct ::= `$` | `.` | `_` | `-` 322 /// 323 Token Lexer::lexPrefixedIdentifier(const char *tokStart) { 324 Token::Kind kind; 325 StringRef errorKind; 326 switch (*tokStart) { 327 case '#': 328 kind = Token::hash_identifier; 329 errorKind = "invalid attribute name"; 330 break; 331 case '%': 332 kind = Token::percent_identifier; 333 errorKind = "invalid SSA name"; 334 break; 335 case '^': 336 kind = Token::caret_identifier; 337 errorKind = "invalid block name"; 338 break; 339 case '!': 340 kind = Token::exclamation_identifier; 341 errorKind = "invalid type identifier"; 342 break; 343 default: 344 llvm_unreachable("invalid caller"); 345 } 346 347 // Parse suffix-id. 348 if (isdigit(*curPtr)) { 349 // If suffix-id starts with a digit, the rest must be digits. 350 while (isdigit(*curPtr)) { 351 ++curPtr; 352 } 353 } else if (isalpha(*curPtr) || isPunct(*curPtr)) { 354 do { 355 ++curPtr; 356 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr)); 357 } else { 358 return emitError(curPtr - 1, errorKind); 359 } 360 361 return formToken(kind, tokStart); 362 } 363 364 /// Lex a string literal. 365 /// 366 /// string-literal ::= '"' [^"\n\f\v\r]* '"' 367 /// 368 /// TODO: define escaping rules. 369 Token Lexer::lexString(const char *tokStart) { 370 assert(curPtr[-1] == '"'); 371 372 while (1) { 373 switch (*curPtr++) { 374 case '"': 375 return formToken(Token::string, tokStart); 376 case 0: 377 // If this is a random nul character in the middle of a string, just 378 // include it. If it is the end of file, then it is an error. 379 if (curPtr - 1 != curBuffer.end()) 380 continue; 381 LLVM_FALLTHROUGH; 382 case '\n': 383 case '\v': 384 case '\f': 385 return emitError(curPtr - 1, "expected '\"' in string literal"); 386 case '\\': 387 // Handle explicitly a few escapes. 388 if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't') 389 ++curPtr; 390 else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1])) 391 // Support \xx for two hex digits. 392 curPtr += 2; 393 else 394 return emitError(curPtr - 1, "unknown escape in string literal"); 395 continue; 396 397 default: 398 continue; 399 } 400 } 401 }