zombiezen.com/go/lua@v0.0.0-20231013005828-290725fb9140/internal/lua54/llex.c (about)

     1  /*
     2  ** $Id: llex.c $
     3  ** Lexical Analyzer
     4  ** See Copyright Notice in lua.h
     5  */
     6  
     7  #define llex_c
     8  #define LUA_CORE
     9  
    10  #include "lprefix.h"
    11  
    12  
    13  #include <locale.h>
    14  #include <string.h>
    15  
    16  #include "lua.h"
    17  
    18  #include "lctype.h"
    19  #include "ldebug.h"
    20  #include "ldo.h"
    21  #include "lgc.h"
    22  #include "llex.h"
    23  #include "lobject.h"
    24  #include "lparser.h"
    25  #include "lstate.h"
    26  #include "lstring.h"
    27  #include "ltable.h"
    28  #include "lzio.h"
    29  
    30  
    31  
    32  #define next(ls)	(ls->current = zgetc(ls->z))
    33  
    34  
    35  
    36  #define currIsNewline(ls)	(ls->current == '\n' || ls->current == '\r')
    37  
    38  
    39  /* ORDER RESERVED */
    40  static const char *const luaX_tokens [] = {
    41      "and", "break", "do", "else", "elseif",
    42      "end", "false", "for", "function", "goto", "if",
    43      "in", "local", "nil", "not", "or", "repeat",
    44      "return", "then", "true", "until", "while",
    45      "//", "..", "...", "==", ">=", "<=", "~=",
    46      "<<", ">>", "::", "<eof>",
    47      "<number>", "<integer>", "<name>", "<string>"
    48  };
    49  
    50  
/*
** Append the current character to the lexer buffer (through 'save')
** and then advance to the following character (through 'next').
*/
#define save_and_next(ls) (save(ls, ls->current), next(ls))


/* forward declaration: 'save' (defined right below) calls 'lexerror' */
static l_noret lexerror (LexState *ls, const char *msg, int token);
    55  
    56  
    57  static void save (LexState *ls, int c) {
    58    Mbuffer *b = ls->buff;
    59    if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
    60      size_t newsize;
    61      if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
    62        lexerror(ls, "lexical element too long", 0);
    63      newsize = luaZ_sizebuffer(b) * 2;
    64      luaZ_resizebuffer(ls->L, b, newsize);
    65    }
    66    b->buffer[luaZ_bufflen(b)++] = cast_char(c);
    67  }
    68  
    69  
    70  void luaX_init (lua_State *L) {
    71    int i;
    72    TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
    73    luaC_fix(L, obj2gco(e));  /* never collect this name */
    74    for (i=0; i<NUM_RESERVED; i++) {
    75      TString *ts = luaS_new(L, luaX_tokens[i]);
    76      luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
    77      ts->extra = cast_byte(i+1);  /* reserved word */
    78    }
    79  }
    80  
    81  
    82  const char *luaX_token2str (LexState *ls, int token) {
    83    if (token < FIRST_RESERVED) {  /* single-byte symbols? */
    84      if (lisprint(token))
    85        return luaO_pushfstring(ls->L, "'%c'", token);
    86      else  /* control character */
    87        return luaO_pushfstring(ls->L, "'<\\%d>'", token);
    88    }
    89    else {
    90      const char *s = luaX_tokens[token - FIRST_RESERVED];
    91      if (token < TK_EOS)  /* fixed format (symbols and reserved words)? */
    92        return luaO_pushfstring(ls->L, "'%s'", s);
    93      else  /* names, strings, and numerals */
    94        return s;
    95    }
    96  }
    97  
    98  
    99  static const char *txtToken (LexState *ls, int token) {
   100    switch (token) {
   101      case TK_NAME: case TK_STRING:
   102      case TK_FLT: case TK_INT:
   103        save(ls, '\0');
   104        return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
   105      default:
   106        return luaX_token2str(ls, token);
   107    }
   108  }
   109  
   110  
   111  static l_noret lexerror (LexState *ls, const char *msg, int token) {
   112    msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
   113    if (token)
   114      luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
   115    luaD_throw(ls->L, LUA_ERRSYNTAX);
   116  }
   117  
   118  
   119  l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
   120    lexerror(ls, msg, ls->t.token);
   121  }
   122  
   123  
   124  /*
   125  ** Creates a new string and anchors it in scanner's table so that it
   126  ** will not be collected until the end of the compilation; by that time
   127  ** it should be anchored somewhere. It also internalizes long strings,
   128  ** ensuring there is only one copy of each unique string.  The table
   129  ** here is used as a set: the string enters as the key, while its value
   130  ** is irrelevant. We use the string itself as the value only because it
   131  ** is a TValue readily available. Later, the code generation can change
   132  ** this value.
   133  */
   134  TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
   135    lua_State *L = ls->L;
   136    TString *ts = luaS_newlstr(L, str, l);  /* create new string */
   137    const TValue *o = luaH_getstr(ls->h, ts);
   138    if (!ttisnil(o))  /* string already present? */
   139      ts = keystrval(nodefromval(o));  /* get saved copy */
   140    else {  /* not in use yet */
   141      TValue *stv = s2v(L->top.p++);  /* reserve stack space for string */
   142      setsvalue(L, stv, ts);  /* temporarily anchor the string */
   143      luaH_finishset(L, ls->h, stv, o, stv);  /* t[string] = string */
   144      /* table is not a metatable, so it does not need to invalidate cache */
   145      luaC_checkGC(L);
   146      L->top.p--;  /* remove string from stack */
   147    }
   148    return ts;
   149  }
   150  
   151  
   152  /*
   153  ** increment line number and skips newline sequence (any of
   154  ** \n, \r, \n\r, or \r\n)
   155  */
   156  static void inclinenumber (LexState *ls) {
   157    int old = ls->current;
   158    lua_assert(currIsNewline(ls));
   159    next(ls);  /* skip '\n' or '\r' */
   160    if (currIsNewline(ls) && ls->current != old)
   161      next(ls);  /* skip '\n\r' or '\r\n' */
   162    if (++ls->linenumber >= MAX_INT)
   163      lexerror(ls, "chunk has too many lines", 0);
   164  }
   165  
   166  
   167  void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
   168                      int firstchar) {
   169    ls->t.token = 0;
   170    ls->L = L;
   171    ls->current = firstchar;
   172    ls->lookahead.token = TK_EOS;  /* no look-ahead token */
   173    ls->z = z;
   174    ls->fs = NULL;
   175    ls->linenumber = 1;
   176    ls->lastline = 1;
   177    ls->source = source;
   178    ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
   179    luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
   180  }
   181  
   182  
   183  
   184  /*
   185  ** =======================================================
   186  ** LEXICAL ANALYZER
   187  ** =======================================================
   188  */
   189  
   190  
   191  static int check_next1 (LexState *ls, int c) {
   192    if (ls->current == c) {
   193      next(ls);
   194      return 1;
   195    }
   196    else return 0;
   197  }
   198  
   199  
   200  /*
   201  ** Check whether current char is in set 'set' (with two chars) and
   202  ** saves it
   203  */
   204  static int check_next2 (LexState *ls, const char *set) {
   205    lua_assert(set[2] == '\0');
   206    if (ls->current == set[0] || ls->current == set[1]) {
   207      save_and_next(ls);
   208      return 1;
   209    }
   210    else return 0;
   211  }
   212  
   213  
   214  /* LUA_NUMBER */
   215  /*
   216  ** This function is quite liberal in what it accepts, as 'luaO_str2num'
   217  ** will reject ill-formed numerals. Roughly, it accepts the following
   218  ** pattern:
   219  **
   220  **   %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))*
   221  **
   222  ** The only tricky part is to accept [+-] only after a valid exponent
   223  ** mark, to avoid reading '3-4' or '0xe+1' as a single number.
   224  **
   225  ** The caller might have already read an initial dot.
   226  */
   227  static int read_numeral (LexState *ls, SemInfo *seminfo) {
   228    TValue obj;
   229    const char *expo = "Ee";
   230    int first = ls->current;
   231    lua_assert(lisdigit(ls->current));
   232    save_and_next(ls);
   233    if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
   234      expo = "Pp";
   235    for (;;) {
   236      if (check_next2(ls, expo))  /* exponent mark? */
   237        check_next2(ls, "-+");  /* optional exponent sign */
   238      else if (lisxdigit(ls->current) || ls->current == '.')  /* '%x|%.' */
   239        save_and_next(ls);
   240      else break;
   241    }
   242    if (lislalpha(ls->current))  /* is numeral touching a letter? */
   243      save_and_next(ls);  /* force an error */
   244    save(ls, '\0');
   245    if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)  /* format error? */
   246      lexerror(ls, "malformed number", TK_FLT);
   247    if (ttisinteger(&obj)) {
   248      seminfo->i = ivalue(&obj);
   249      return TK_INT;
   250    }
   251    else {
   252      lua_assert(ttisfloat(&obj));
   253      seminfo->r = fltvalue(&obj);
   254      return TK_FLT;
   255    }
   256  }
   257  
   258  
   259  /*
   260  ** read a sequence '[=*[' or ']=*]', leaving the last bracket. If
   261  ** sequence is well formed, return its number of '='s + 2; otherwise,
   262  ** return 1 if it is a single bracket (no '='s and no 2nd bracket);
   263  ** otherwise (an unfinished '[==...') return 0.
   264  */
   265  static size_t skip_sep (LexState *ls) {
   266    size_t count = 0;
   267    int s = ls->current;
   268    lua_assert(s == '[' || s == ']');
   269    save_and_next(ls);
   270    while (ls->current == '=') {
   271      save_and_next(ls);
   272      count++;
   273    }
   274    return (ls->current == s) ? count + 2
   275           : (count == 0) ? 1
   276           : 0;
   277  }
   278  
   279  
   280  static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
   281    int line = ls->linenumber;  /* initial line (for error message) */
   282    save_and_next(ls);  /* skip 2nd '[' */
   283    if (currIsNewline(ls))  /* string starts with a newline? */
   284      inclinenumber(ls);  /* skip it */
   285    for (;;) {
   286      switch (ls->current) {
   287        case EOZ: {  /* error */
   288          const char *what = (seminfo ? "string" : "comment");
   289          const char *msg = luaO_pushfstring(ls->L,
   290                       "unfinished long %s (starting at line %d)", what, line);
   291          lexerror(ls, msg, TK_EOS);
   292          break;  /* to avoid warnings */
   293        }
   294        case ']': {
   295          if (skip_sep(ls) == sep) {
   296            save_and_next(ls);  /* skip 2nd ']' */
   297            goto endloop;
   298          }
   299          break;
   300        }
   301        case '\n': case '\r': {
   302          save(ls, '\n');
   303          inclinenumber(ls);
   304          if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
   305          break;
   306        }
   307        default: {
   308          if (seminfo) save_and_next(ls);
   309          else next(ls);
   310        }
   311      }
   312    } endloop:
   313    if (seminfo)
   314      seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
   315                                       luaZ_bufflen(ls->buff) - 2 * sep);
   316  }
   317  
   318  
   319  static void esccheck (LexState *ls, int c, const char *msg) {
   320    if (!c) {
   321      if (ls->current != EOZ)
   322        save_and_next(ls);  /* add current to buffer for error message */
   323      lexerror(ls, msg, TK_STRING);
   324    }
   325  }
   326  
   327  
   328  static int gethexa (LexState *ls) {
   329    save_and_next(ls);
   330    esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
   331    return luaO_hexavalue(ls->current);
   332  }
   333  
   334  
   335  static int readhexaesc (LexState *ls) {
   336    int r = gethexa(ls);
   337    r = (r << 4) + gethexa(ls);
   338    luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
   339    return r;
   340  }
   341  
   342  
   343  static unsigned long readutf8esc (LexState *ls) {
   344    unsigned long r;
   345    int i = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
   346    save_and_next(ls);  /* skip 'u' */
   347    esccheck(ls, ls->current == '{', "missing '{'");
   348    r = gethexa(ls);  /* must have at least one digit */
   349    while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
   350      i++;
   351      esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
   352      r = (r << 4) + luaO_hexavalue(ls->current);
   353    }
   354    esccheck(ls, ls->current == '}', "missing '}'");
   355    next(ls);  /* skip '}' */
   356    luaZ_buffremove(ls->buff, i);  /* remove saved chars from buffer */
   357    return r;
   358  }
   359  
   360  
   361  static void utf8esc (LexState *ls) {
   362    char buff[UTF8BUFFSZ];
   363    int n = luaO_utf8esc(buff, readutf8esc(ls));
   364    for (; n > 0; n--)  /* add 'buff' to string */
   365      save(ls, buff[UTF8BUFFSZ - n]);
   366  }
   367  
   368  
   369  static int readdecesc (LexState *ls) {
   370    int i;
   371    int r = 0;  /* result accumulator */
   372    for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
   373      r = 10*r + ls->current - '0';
   374      save_and_next(ls);
   375    }
   376    esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
   377    luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
   378    return r;
   379  }
   380  
   381  
   382  static void read_string (LexState *ls, int del, SemInfo *seminfo) {
   383    save_and_next(ls);  /* keep delimiter (for error messages) */
   384    while (ls->current != del) {
   385      switch (ls->current) {
   386        case EOZ:
   387          lexerror(ls, "unfinished string", TK_EOS);
   388          break;  /* to avoid warnings */
   389        case '\n':
   390        case '\r':
   391          lexerror(ls, "unfinished string", TK_STRING);
   392          break;  /* to avoid warnings */
   393        case '\\': {  /* escape sequences */
   394          int c;  /* final character to be saved */
   395          save_and_next(ls);  /* keep '\\' for error messages */
   396          switch (ls->current) {
   397            case 'a': c = '\a'; goto read_save;
   398            case 'b': c = '\b'; goto read_save;
   399            case 'f': c = '\f'; goto read_save;
   400            case 'n': c = '\n'; goto read_save;
   401            case 'r': c = '\r'; goto read_save;
   402            case 't': c = '\t'; goto read_save;
   403            case 'v': c = '\v'; goto read_save;
   404            case 'x': c = readhexaesc(ls); goto read_save;
   405            case 'u': utf8esc(ls);  goto no_save;
   406            case '\n': case '\r':
   407              inclinenumber(ls); c = '\n'; goto only_save;
   408            case '\\': case '\"': case '\'':
   409              c = ls->current; goto read_save;
   410            case EOZ: goto no_save;  /* will raise an error next loop */
   411            case 'z': {  /* zap following span of spaces */
   412              luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
   413              next(ls);  /* skip the 'z' */
   414              while (lisspace(ls->current)) {
   415                if (currIsNewline(ls)) inclinenumber(ls);
   416                else next(ls);
   417              }
   418              goto no_save;
   419            }
   420            default: {
   421              esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
   422              c = readdecesc(ls);  /* digital escape '\ddd' */
   423              goto only_save;
   424            }
   425          }
   426         read_save:
   427           next(ls);
   428           /* go through */
   429         only_save:
   430           luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
   431           save(ls, c);
   432           /* go through */
   433         no_save: break;
   434        }
   435        default:
   436          save_and_next(ls);
   437      }
   438    }
   439    save_and_next(ls);  /* skip delimiter */
   440    seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
   441                                     luaZ_bufflen(ls->buff) - 2);
   442  }
   443  
   444  
   445  static int llex (LexState *ls, SemInfo *seminfo) {
   446    luaZ_resetbuffer(ls->buff);
   447    for (;;) {
   448      switch (ls->current) {
   449        case '\n': case '\r': {  /* line breaks */
   450          inclinenumber(ls);
   451          break;
   452        }
   453        case ' ': case '\f': case '\t': case '\v': {  /* spaces */
   454          next(ls);
   455          break;
   456        }
   457        case '-': {  /* '-' or '--' (comment) */
   458          next(ls);
   459          if (ls->current != '-') return '-';
   460          /* else is a comment */
   461          next(ls);
   462          if (ls->current == '[') {  /* long comment? */
   463            size_t sep = skip_sep(ls);
   464            luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
   465            if (sep >= 2) {
   466              read_long_string(ls, NULL, sep);  /* skip long comment */
   467              luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
   468              break;
   469            }
   470          }
   471          /* else short comment */
   472          while (!currIsNewline(ls) && ls->current != EOZ)
   473            next(ls);  /* skip until end of line (or end of file) */
   474          break;
   475        }
   476        case '[': {  /* long string or simply '[' */
   477          size_t sep = skip_sep(ls);
   478          if (sep >= 2) {
   479            read_long_string(ls, seminfo, sep);
   480            return TK_STRING;
   481          }
   482          else if (sep == 0)  /* '[=...' missing second bracket? */
   483            lexerror(ls, "invalid long string delimiter", TK_STRING);
   484          return '[';
   485        }
   486        case '=': {
   487          next(ls);
   488          if (check_next1(ls, '=')) return TK_EQ;  /* '==' */
   489          else return '=';
   490        }
   491        case '<': {
   492          next(ls);
   493          if (check_next1(ls, '=')) return TK_LE;  /* '<=' */
   494          else if (check_next1(ls, '<')) return TK_SHL;  /* '<<' */
   495          else return '<';
   496        }
   497        case '>': {
   498          next(ls);
   499          if (check_next1(ls, '=')) return TK_GE;  /* '>=' */
   500          else if (check_next1(ls, '>')) return TK_SHR;  /* '>>' */
   501          else return '>';
   502        }
   503        case '/': {
   504          next(ls);
   505          if (check_next1(ls, '/')) return TK_IDIV;  /* '//' */
   506          else return '/';
   507        }
   508        case '~': {
   509          next(ls);
   510          if (check_next1(ls, '=')) return TK_NE;  /* '~=' */
   511          else return '~';
   512        }
   513        case ':': {
   514          next(ls);
   515          if (check_next1(ls, ':')) return TK_DBCOLON;  /* '::' */
   516          else return ':';
   517        }
   518        case '"': case '\'': {  /* short literal strings */
   519          read_string(ls, ls->current, seminfo);
   520          return TK_STRING;
   521        }
   522        case '.': {  /* '.', '..', '...', or number */
   523          save_and_next(ls);
   524          if (check_next1(ls, '.')) {
   525            if (check_next1(ls, '.'))
   526              return TK_DOTS;   /* '...' */
   527            else return TK_CONCAT;   /* '..' */
   528          }
   529          else if (!lisdigit(ls->current)) return '.';
   530          else return read_numeral(ls, seminfo);
   531        }
   532        case '0': case '1': case '2': case '3': case '4':
   533        case '5': case '6': case '7': case '8': case '9': {
   534          return read_numeral(ls, seminfo);
   535        }
   536        case EOZ: {
   537          return TK_EOS;
   538        }
   539        default: {
   540          if (lislalpha(ls->current)) {  /* identifier or reserved word? */
   541            TString *ts;
   542            do {
   543              save_and_next(ls);
   544            } while (lislalnum(ls->current));
   545            ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
   546                                    luaZ_bufflen(ls->buff));
   547            seminfo->ts = ts;
   548            if (isreserved(ts))  /* reserved word? */
   549              return ts->extra - 1 + FIRST_RESERVED;
   550            else {
   551              return TK_NAME;
   552            }
   553          }
   554          else {  /* single-char tokens ('+', '*', '%', '{', '}', ...) */
   555            int c = ls->current;
   556            next(ls);
   557            return c;
   558          }
   559        }
   560      }
   561    }
   562  }
   563  
   564  
   565  void luaX_next (LexState *ls) {
   566    ls->lastline = ls->linenumber;
   567    if (ls->lookahead.token != TK_EOS) {  /* is there a look-ahead token? */
   568      ls->t = ls->lookahead;  /* use this one */
   569      ls->lookahead.token = TK_EOS;  /* and discharge it */
   570    }
   571    else
   572      ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
   573  }
   574  
   575  
   576  int luaX_lookahead (LexState *ls) {
   577    lua_assert(ls->lookahead.token == TK_EOS);
   578    ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
   579    return ls->lookahead.token;
   580  }
   581