github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/mlir/lib/Parser/Lexer.cpp (about)

     1  //===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
     2  //
     3  // Copyright 2019 The MLIR Authors.
     4  //
     5  // Licensed under the Apache License, Version 2.0 (the "License");
     6  // you may not use this file except in compliance with the License.
     7  // You may obtain a copy of the License at
     8  //
     9  //   http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  // =============================================================================
    17  //
    18  // This file implements the lexer for the MLIR textual form.
    19  //
    20  //===----------------------------------------------------------------------===//
    21  
    22  #include "Lexer.h"
    23  #include "mlir/IR/Diagnostics.h"
    24  #include "mlir/IR/Identifier.h"
    25  #include "mlir/IR/Location.h"
    26  #include "mlir/IR/MLIRContext.h"
    27  #include "llvm/Support/SourceMgr.h"
    28  using namespace mlir;
    29  
    30  using llvm::SMLoc;
    31  using llvm::SourceMgr;
    32  
    33  // Returns true if 'c' is an allowable puncuation character: [$._-]
    34  // Returns false otherwise.
    35  static bool isPunct(char c) {
    36    return c == '$' || c == '.' || c == '_' || c == '-';
    37  }
    38  
    39  Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
    40      : sourceMgr(sourceMgr), context(context) {
    41    auto bufferID = sourceMgr.getMainFileID();
    42    curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
    43    curPtr = curBuffer.begin();
    44  }
    45  
    46  /// Encode the specified source location information into an attribute for
    47  /// attachment to the IR.
    48  Location Lexer::getEncodedSourceLocation(llvm::SMLoc loc) {
    49    auto &sourceMgr = getSourceMgr();
    50    unsigned mainFileID = sourceMgr.getMainFileID();
    51    auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID);
    52    auto *buffer = sourceMgr.getMemoryBuffer(mainFileID);
    53  
    54    return FileLineColLoc::get(buffer->getBufferIdentifier(), lineAndColumn.first,
    55                               lineAndColumn.second, context);
    56  }
    57  
    58  /// emitError - Emit an error message and return an Token::error token.
    59  Token Lexer::emitError(const char *loc, const Twine &message) {
    60    mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc)),
    61                    message);
    62    return formToken(Token::error, loc);
    63  }
    64  
    65  Token Lexer::lexToken() {
    66    // Ignore whitespace.
    67    while (true) {
    68      switch (*curPtr) {
    69      case ' ':
    70      case '\t':
    71      case '\n':
    72      case '\r':
    73        ++curPtr;
    74        continue;
    75      default:
    76        // Terminate loop on non-whitespace, including either an embedded or
    77        // final terminating nul character that llvm::MemoryBuffer guarantees
    78        // will be there.
    79        break;
    80      }
    81      break;
    82    }
    83  
    84    const char *tokStart = curPtr;
    85    switch (*curPtr++) {
    86    default:
    87      // Handle bare identifiers.
    88      if (isalpha(curPtr[-1]))
    89        return lexBareIdentifierOrKeyword(tokStart);
    90  
    91      // Unknown character, emit an error.
    92      return emitError(tokStart, "unexpected character");
    93  
    94    case '_':
    95      // Handle bare identifiers.
    96      return lexBareIdentifierOrKeyword(tokStart);
    97  
    98    case 0:
    99      // This may either be a nul character in the source file or may be the EOF
   100      // marker that llvm::MemoryBuffer guarantees will be there.
   101      if (curPtr - 1 == curBuffer.end())
   102        return formToken(Token::eof, tokStart);
   103  
   104      LLVM_FALLTHROUGH;
   105    case ':':
   106      return formToken(Token::colon, tokStart);
   107    case ',':
   108      return formToken(Token::comma, tokStart);
   109    case '.':
   110      return lexEllipsis(tokStart);
   111    case '(':
   112      return formToken(Token::l_paren, tokStart);
   113    case ')':
   114      return formToken(Token::r_paren, tokStart);
   115    case '{':
   116      return formToken(Token::l_brace, tokStart);
   117    case '}':
   118      return formToken(Token::r_brace, tokStart);
   119    case '[':
   120      return formToken(Token::l_square, tokStart);
   121    case ']':
   122      return formToken(Token::r_square, tokStart);
   123    case '<':
   124      return formToken(Token::less, tokStart);
   125    case '>':
   126      return formToken(Token::greater, tokStart);
   127    case '=':
   128      return formToken(Token::equal, tokStart);
   129  
   130    case '+':
   131      return formToken(Token::plus, tokStart);
   132    case '*':
   133      return formToken(Token::star, tokStart);
   134    case '-':
   135      if (*curPtr == '>') {
   136        ++curPtr;
   137        return formToken(Token::arrow, tokStart);
   138      }
   139      return formToken(Token::minus, tokStart);
   140  
   141    case '?':
   142      return formToken(Token::question, tokStart);
   143  
   144    case '/':
   145      if (*curPtr == '/')
   146        return lexComment();
   147      return emitError(tokStart, "unexpected character");
   148  
   149    case '@':
   150      return lexAtIdentifier(tokStart);
   151  
   152    case '!':
   153      LLVM_FALLTHROUGH;
   154    case '^':
   155      LLVM_FALLTHROUGH;
   156    case '#':
   157      LLVM_FALLTHROUGH;
   158    case '%':
   159      return lexPrefixedIdentifier(tokStart);
   160    case '"':
   161      return lexString(tokStart);
   162  
   163    case '0':
   164    case '1':
   165    case '2':
   166    case '3':
   167    case '4':
   168    case '5':
   169    case '6':
   170    case '7':
   171    case '8':
   172    case '9':
   173      return lexNumber(tokStart);
   174    }
   175  }
   176  
   177  /// Lex an '@foo' identifier.
   178  ///
   179  ///   symbol-ref-id ::= `@` bare-id
   180  ///
   181  Token Lexer::lexAtIdentifier(const char *tokStart) {
   182    // These always start with a letter or underscore.
   183    auto cur = *curPtr++;
   184    if (!isalpha(cur) && cur != '_')
   185      return emitError(curPtr - 1,
   186                       "@ identifier expected to start with letter or '_'");
   187  
   188    while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
   189           *curPtr == '$' || *curPtr == '.')
   190      ++curPtr;
   191    return formToken(Token::at_identifier, tokStart);
   192  }
   193  
   194  /// Lex a bare identifier or keyword that starts with a letter.
   195  ///
   196  ///   bare-id ::= (letter|[_]) (letter|digit|[_$.])*
   197  ///   integer-type ::= `i[1-9][0-9]*`
   198  ///
   199  Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
   200    // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
   201    while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
   202           *curPtr == '$' || *curPtr == '.')
   203      ++curPtr;
   204  
   205    // Check to see if this identifier is a keyword.
   206    StringRef spelling(tokStart, curPtr - tokStart);
   207  
   208    // Check for i123.
   209    if (tokStart[0] == 'i') {
   210      bool allDigits = true;
   211      for (auto c : spelling.drop_front())
   212        allDigits &= isdigit(c) != 0;
   213      if (allDigits && spelling.size() != 1)
   214        return Token(Token::inttype, spelling);
   215    }
   216  
   217    Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling)
   218  #define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
   219  #include "TokenKinds.def"
   220                           .Default(Token::bare_identifier);
   221  
   222    return Token(kind, spelling);
   223  }
   224  
   225  /// Lex a comment line, starting with a semicolon.
   226  ///
   227  ///   TODO: add a regex for comments here and to the spec.
   228  ///
   229  Token Lexer::lexComment() {
   230    // Advance over the second '/' in a '//' comment.
   231    assert(*curPtr == '/');
   232    ++curPtr;
   233  
   234    while (true) {
   235      switch (*curPtr++) {
   236      case '\n':
   237      case '\r':
   238        // Newline is end of comment.
   239        return lexToken();
   240      case 0:
   241        // If this is the end of the buffer, end the comment.
   242        if (curPtr - 1 == curBuffer.end()) {
   243          --curPtr;
   244          return lexToken();
   245        }
   246        LLVM_FALLTHROUGH;
   247      default:
   248        // Skip over other characters.
   249        break;
   250      }
   251    }
   252  }
   253  
   254  /// Lex an ellipsis.
   255  ///
   256  ///   ellipsis ::= '...'
   257  ///
   258  Token Lexer::lexEllipsis(const char *tokStart) {
   259    assert(curPtr[-1] == '.');
   260  
   261    if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.')
   262      return emitError(curPtr, "expected three consecutive dots for an ellipsis");
   263  
   264    curPtr += 2;
   265    return formToken(Token::ellipsis, tokStart);
   266  }
   267  
   268  /// Lex a number literal.
   269  ///
   270  ///   integer-literal ::= digit+ | `0x` hex_digit+
   271  ///   float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
   272  ///
   273  Token Lexer::lexNumber(const char *tokStart) {
   274    assert(isdigit(curPtr[-1]));
   275  
   276    // Handle the hexadecimal case.
   277    if (curPtr[-1] == '0' && *curPtr == 'x') {
   278      // If we see stuff like 0xi32, this is a literal `0` follwed by an
   279      // identifier `xi32`, stop after `0`.
   280      if (!isxdigit(curPtr[1]))
   281        return formToken(Token::integer, tokStart);
   282  
   283      curPtr += 2;
   284      while (isxdigit(*curPtr))
   285        ++curPtr;
   286  
   287      return formToken(Token::integer, tokStart);
   288    }
   289  
   290    // Handle the normal decimal case.
   291    while (isdigit(*curPtr))
   292      ++curPtr;
   293  
   294    if (*curPtr != '.')
   295      return formToken(Token::integer, tokStart);
   296    ++curPtr;
   297  
   298    // Skip over [0-9]*([eE][-+]?[0-9]+)?
   299    while (isdigit(*curPtr))
   300      ++curPtr;
   301  
   302    if (*curPtr == 'e' || *curPtr == 'E') {
   303      if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
   304          ((curPtr[1] == '-' || curPtr[1] == '+') &&
   305           isdigit(static_cast<unsigned char>(curPtr[2])))) {
   306        curPtr += 2;
   307        while (isdigit(*curPtr))
   308          ++curPtr;
   309      }
   310    }
   311    return formToken(Token::floatliteral, tokStart);
   312  }
   313  
   314  /// Lex an identifier that starts with a prefix followed by suffix-id.
   315  ///
   316  ///   affine-map-id ::= `#` suffix-id
   317  ///   ssa-id        ::= '%' suffix-id
   318  ///   block-id      ::= '^' suffix-id
   319  ///   type-id       ::= '!' suffix-id
   320  ///   suffix-id     ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
   321  ///   id-punct      ::= `$` | `.` | `_` | `-`
   322  ///
   323  Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
   324    Token::Kind kind;
   325    StringRef errorKind;
   326    switch (*tokStart) {
   327    case '#':
   328      kind = Token::hash_identifier;
   329      errorKind = "invalid attribute name";
   330      break;
   331    case '%':
   332      kind = Token::percent_identifier;
   333      errorKind = "invalid SSA name";
   334      break;
   335    case '^':
   336      kind = Token::caret_identifier;
   337      errorKind = "invalid block name";
   338      break;
   339    case '!':
   340      kind = Token::exclamation_identifier;
   341      errorKind = "invalid type identifier";
   342      break;
   343    default:
   344      llvm_unreachable("invalid caller");
   345    }
   346  
   347    // Parse suffix-id.
   348    if (isdigit(*curPtr)) {
   349      // If suffix-id starts with a digit, the rest must be digits.
   350      while (isdigit(*curPtr)) {
   351        ++curPtr;
   352      }
   353    } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
   354      do {
   355        ++curPtr;
   356      } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
   357    } else {
   358      return emitError(curPtr - 1, errorKind);
   359    }
   360  
   361    return formToken(kind, tokStart);
   362  }
   363  
   364  /// Lex a string literal.
   365  ///
   366  ///   string-literal ::= '"' [^"\n\f\v\r]* '"'
   367  ///
   368  /// TODO: define escaping rules.
   369  Token Lexer::lexString(const char *tokStart) {
   370    assert(curPtr[-1] == '"');
   371  
   372    while (1) {
   373      switch (*curPtr++) {
   374      case '"':
   375        return formToken(Token::string, tokStart);
   376      case 0:
   377        // If this is a random nul character in the middle of a string, just
   378        // include it.  If it is the end of file, then it is an error.
   379        if (curPtr - 1 != curBuffer.end())
   380          continue;
   381        LLVM_FALLTHROUGH;
   382      case '\n':
   383      case '\v':
   384      case '\f':
   385        return emitError(curPtr - 1, "expected '\"' in string literal");
   386      case '\\':
   387        // Handle explicitly a few escapes.
   388        if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
   389          ++curPtr;
   390        else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
   391          // Support \xx for two hex digits.
   392          curPtr += 2;
   393        else
   394          return emitError(curPtr - 1, "unknown escape in string literal");
   395        continue;
   396  
   397      default:
   398        continue;
   399      }
   400    }
   401  }