zombiezen.com/go/lua@v0.0.0-20231013005828-290725fb9140/internal/lua54/lutf8lib.c (about)

     1  /*
     2  ** $Id: lutf8lib.c $
     3  ** Standard library for UTF-8 manipulation
     4  ** See Copyright Notice in lua.h
     5  */
     6  
     7  #define lutf8lib_c
     8  #define LUA_LIB
     9  
    10  #include "lprefix.h"
    11  
    12  
    13  #include <assert.h>
    14  #include <limits.h>
    15  #include <stdlib.h>
    16  #include <string.h>
    17  
    18  #include "lua.h"
    19  
    20  #include "lauxlib.h"
    21  #include "lualib.h"
    22  
    23  
    /* largest code point accepted when decoding in strict mode */
    #define MAXUNICODE	0x10FFFFu

    /* largest value this library encodes or decodes (it needs 31 bits) */
    #define MAXUTF		0x7FFFFFFFu


    /* message reported when a byte sequence is not valid UTF-8 */
    #define MSGInvalid	"invalid UTF-8 code"

    /*
    ** Unsigned integer type for decoded UTF-8 values.  It must be able to
    ** hold MAXUTF: 'unsigned int' is used when it is wide enough, and
    ** 'unsigned long' otherwise.
    */
    #if UINT_MAX >= MAXUTF
    typedef unsigned int utfint;
    #else
    typedef unsigned long utfint;
    #endif


    /* true if 'c' has the bit pattern 10xxxxxx (a UTF-8 continuation byte) */
    #define iscont(c)	(((c) & 0xC0) == 0x80)
    /* same test, applied to the byte that 'p' points to */
    #define iscontp(p)	iscont(*(p))
    43  
    44  
    45  /* from strlib */
    46  /* translate a relative string position: negative means back from end */
    47  static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
    48    if (pos >= 0) return pos;
    49    else if (0u - (size_t)pos > len) return 0;
    50    else return (lua_Integer)len + pos + 1;
    51  }
    52  
    53  
    54  /*
    55  ** Decode one UTF-8 sequence, returning NULL if byte sequence is
    56  ** invalid.  The array 'limits' stores the minimum value for each
    57  ** sequence length, to check for overlong representations. Its first
    58  ** entry forces an error for non-ascii bytes with no continuation
    59  ** bytes (count == 0).
    60  */
    61  static const char *utf8_decode (const char *s, utfint *val, int strict) {
    62    static const utfint limits[] =
    63          {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
    64    unsigned int c = (unsigned char)s[0];
    65    utfint res = 0;  /* final result */
    66    if (c < 0x80)  /* ascii? */
    67      res = c;
    68    else {
    69      int count = 0;  /* to count number of continuation bytes */
    70      for (; c & 0x40; c <<= 1) {  /* while it needs continuation bytes... */
    71        unsigned int cc = (unsigned char)s[++count];  /* read next byte */
    72        if (!iscont(cc))  /* not a continuation byte? */
    73          return NULL;  /* invalid byte sequence */
    74        res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
    75      }
    76      res |= ((utfint)(c & 0x7F) << (count * 5));  /* add first byte */
    77      if (count > 5 || res > MAXUTF || res < limits[count])
    78        return NULL;  /* invalid byte sequence */
    79      s += count;  /* skip continuation bytes read */
    80    }
    81    if (strict) {
    82      /* check for invalid code points; too large or surrogates */
    83      if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
    84        return NULL;
    85    }
    86    if (val) *val = res;
    87    return s + 1;  /* +1 to include first byte */
    88  }
    89  
    90  
    91  /*
    92  ** utf8len(s [, i [, j [, lax]]]) --> number of characters that
    93  ** start in the range [i,j], or nil + current position if 's' is not
    94  ** well formed in that interval
    95  */
    96  static int utflen (lua_State *L) {
    97    lua_Integer n = 0;  /* counter for the number of characters */
    98    size_t len;  /* string length in bytes */
    99    const char *s = luaL_checklstring(L, 1, &len);
   100    lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
   101    lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
   102    int lax = lua_toboolean(L, 4);
   103    luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
   104                     "initial position out of bounds");
   105    luaL_argcheck(L, --posj < (lua_Integer)len, 3,
   106                     "final position out of bounds");
   107    while (posi <= posj) {
   108      const char *s1 = utf8_decode(s + posi, NULL, !lax);
   109      if (s1 == NULL) {  /* conversion error? */
   110        luaL_pushfail(L);  /* return fail ... */
   111        lua_pushinteger(L, posi + 1);  /* ... and current position */
   112        return 2;
   113      }
   114      posi = s1 - s;
   115      n++;
   116    }
   117    lua_pushinteger(L, n);
   118    return 1;
   119  }
   120  
   121  
   122  /*
   123  ** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all
   124  ** characters that start in the range [i,j]
   125  */
   126  static int codepoint (lua_State *L) {
   127    size_t len;
   128    const char *s = luaL_checklstring(L, 1, &len);
   129    lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
   130    lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
   131    int lax = lua_toboolean(L, 4);
   132    int n;
   133    const char *se;
   134    luaL_argcheck(L, posi >= 1, 2, "out of bounds");
   135    luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds");
   136    if (posi > pose) return 0;  /* empty interval; return no values */
   137    if (pose - posi >= INT_MAX)  /* (lua_Integer -> int) overflow? */
   138      return luaL_error(L, "string slice too long");
   139    n = (int)(pose -  posi) + 1;  /* upper bound for number of returns */
   140    luaL_checkstack(L, n, "string slice too long");
   141    n = 0;  /* count the number of returns */
   142    se = s + pose;  /* string end */
   143    for (s += posi - 1; s < se;) {
   144      utfint code;
   145      s = utf8_decode(s, &code, !lax);
   146      if (s == NULL)
   147        return luaL_error(L, MSGInvalid);
   148      lua_pushinteger(L, code);
   149      n++;
   150    }
   151    return n;
   152  }
   153  
   154  
   155  static void pushutfchar (lua_State *L, int arg) {
   156    lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
   157    luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
   158    lua_pushfstring(L, "%U", (long)code);
   159  }
   160  
   161  
   162  /*
   163  ** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
   164  */
   165  static int utfchar (lua_State *L) {
   166    int n = lua_gettop(L);  /* number of arguments */
   167    if (n == 1)  /* optimize common case of single char */
   168      pushutfchar(L, 1);
   169    else {
   170      int i;
   171      luaL_Buffer b;
   172      luaL_buffinit(L, &b);
   173      for (i = 1; i <= n; i++) {
   174        pushutfchar(L, i);
   175        luaL_addvalue(&b);
   176      }
   177      luaL_pushresult(&b);
   178    }
   179    return 1;
   180  }
   181  
   182  
   183  /*
   184  ** offset(s, n, [i])  -> index where n-th character counting from
   185  **   position 'i' starts; 0 means character at 'i'.
   186  */
   187  static int byteoffset (lua_State *L) {
   188    size_t len;
   189    const char *s = luaL_checklstring(L, 1, &len);
   190    lua_Integer n  = luaL_checkinteger(L, 2);
   191    lua_Integer posi = (n >= 0) ? 1 : len + 1;
   192    posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
   193    luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
   194                     "position out of bounds");
   195    if (n == 0) {
   196      /* find beginning of current byte sequence */
   197      while (posi > 0 && iscontp(s + posi)) posi--;
   198    }
   199    else {
   200      if (iscontp(s + posi))
   201        return luaL_error(L, "initial position is a continuation byte");
   202      if (n < 0) {
   203         while (n < 0 && posi > 0) {  /* move back */
   204           do {  /* find beginning of previous character */
   205             posi--;
   206           } while (posi > 0 && iscontp(s + posi));
   207           n++;
   208         }
   209       }
   210       else {
   211         n--;  /* do not move for 1st character */
   212         while (n > 0 && posi < (lua_Integer)len) {
   213           do {  /* find beginning of next character */
   214             posi++;
   215           } while (iscontp(s + posi));  /* (cannot pass final '\0') */
   216           n--;
   217         }
   218       }
   219    }
   220    if (n == 0)  /* did it find given character? */
   221      lua_pushinteger(L, posi + 1);
   222    else  /* no such character */
   223      luaL_pushfail(L);
   224    return 1;
   225  }
   226  
   227  
   228  static int iter_aux (lua_State *L, int strict) {
   229    size_t len;
   230    const char *s = luaL_checklstring(L, 1, &len);
   231    lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2);
   232    if (n < len) {
   233      while (iscontp(s + n)) n++;  /* go to next character */
   234    }
   235    if (n >= len)  /* (also handles original 'n' being negative) */
   236      return 0;  /* no more codepoints */
   237    else {
   238      utfint code;
   239      const char *next = utf8_decode(s + n, &code, strict);
   240      if (next == NULL || iscontp(next))
   241        return luaL_error(L, MSGInvalid);
   242      lua_pushinteger(L, n + 1);
   243      lua_pushinteger(L, code);
   244      return 2;
   245    }
   246  }
   247  
   248  
   249  static int iter_auxstrict (lua_State *L) {
   250    return iter_aux(L, 1);
   251  }
   252  
   253  static int iter_auxlax (lua_State *L) {
   254    return iter_aux(L, 0);
   255  }
   256  
   257  
   258  static int iter_codes (lua_State *L) {
   259    int lax = lua_toboolean(L, 2);
   260    const char *s = luaL_checkstring(L, 1);
   261    luaL_argcheck(L, !iscontp(s), 1, MSGInvalid);
   262    lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
   263    lua_pushvalue(L, 1);
   264    lua_pushinteger(L, 0);
   265    return 3;
   266  }
   267  
   268  
   /*
   ** Pattern that matches a single UTF-8 character: one lead byte followed
   ** by any number of continuation bytes.  The literal contains an
   ** embedded '\0', so its length must be taken with 'sizeof', not
   ** 'strlen'.
   */
   #define UTF8PATT	"[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
   271  
   272  
   273  static const luaL_Reg funcs[] = {
   274    {"offset", byteoffset},
   275    {"codepoint", codepoint},
   276    {"char", utfchar},
   277    {"len", utflen},
   278    {"codes", iter_codes},
   279    /* placeholders */
   280    {"charpattern", NULL},
   281    {NULL, NULL}
   282  };
   283  
   284  
   285  LUAMOD_API int luaopen_utf8 (lua_State *L) {
   286    luaL_newlib(L, funcs);
   287    lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
   288    lua_setfield(L, -2, "charpattern");
   289    return 1;
   290  }
   291