github.com/aergoio/aergo@v1.3.1/contract/lutf8lib.c (about)

     1  /*
     2  ** $Id: lutf8lib.c,v 1.16.1.1 2017/04/19 17:29:57 roberto Exp $
     3  ** Standard library for UTF-8 manipulation
     4  ** See Copyright Notice in lua.h
     5  */
     6  
     7  #define lutf8lib_c
     8  #define LUA_LIB
     9  
    10  //#include "lprefix.h"
    11  
    12  
    13  #include <assert.h>
    14  #include <limits.h>
    15  #include <stdlib.h>
    16  #include <string.h>
    17  
    18  #include "lua.h"
    19  
    20  #include "lauxlib.h"
    21  #include "lualib.h"
    22  #include "util.h"
    23  
    24  #define MAXUNICODE	0x10FFFF
    25  
    26  #define iscont(p)	((*(p) & 0xC0) == 0x80)
    27  
    28  
    29  /* from strlib */
    30  /* translate a relative string position: negative means back from end */
    31  static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
    32    if (pos >= 0) return pos;
    33    else if (0u - (size_t)pos > len) return 0;
    34    else return (lua_Integer)len + pos + 1;
    35  }
    36  
    37  
    38  /*
    39  ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
    40  */
    41  static const char *utf8_decode (const char *o, int *val) {
    42    static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
    43    const unsigned char *s = (const unsigned char *)o;
    44    unsigned int c = s[0];
    45    unsigned int res = 0;  /* final result */
    46    if (c < 0x80)  /* ascii? */
    47      res = c;
    48    else {
    49      int count = 0;  /* to count number of continuation bytes */
    50      while (c & 0x40) {  /* still have continuation bytes? */
    51        int cc = s[++count];  /* read next byte */
    52        if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
    53          return NULL;  /* invalid byte sequence */
    54        res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
    55        c <<= 1;  /* to test next bit */
    56      }
    57      res |= ((c & 0x7F) << (count * 5));  /* add first byte */
    58      if (count > 3 || res > MAXUNICODE || res <= limits[count])
    59        return NULL;  /* invalid byte sequence */
    60      s += count;  /* skip continuation bytes read */
    61    }
    62    if (val) *val = res;
    63    return (const char *)s + 1;  /* +1 to include first byte */
    64  }
    65  
    66  
    67  /*
    68  ** utf8len(s [, i [, j]]) --> number of characters that start in the
    69  ** range [i,j], or nil + current position if 's' is not well formed in
    70  ** that interval
    71  */
    72  static int utflen (lua_State *L) {
    73    int n = 0;
    74    size_t len;
    75    const char *s = luaL_checklstring(L, 1, &len);
    76    lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
    77    lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
    78    luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
    79                     "initial position out of string");
    80    luaL_argcheck(L, --posj < (lua_Integer)len, 3,
    81                     "final position out of string");
    82    while (posi <= posj) {
    83      const char *s1 = utf8_decode(s + posi, NULL);
    84      if (s1 == NULL) {  /* conversion error? */
    85        lua_pushnil(L);  /* return nil ... */
    86        lua_pushinteger(L, posi + 1);  /* ... and current position */
    87        return 2;
    88      }
    89      posi = s1 - s;
    90      n++;
    91    }
    92    lua_pushinteger(L, n);
    93    return 1;
    94  }
    95  
    96  
    97  /*
    98  ** codepoint(s, [i, [j]])  -> returns codepoints for all characters
    99  ** that start in the range [i,j]
   100  */
   101  static int codepoint (lua_State *L) {
   102    size_t len;
   103    const char *s = luaL_checklstring(L, 1, &len);
   104    lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
   105    lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
   106    int n;
   107    const char *se;
   108    luaL_argcheck(L, posi >= 1, 2, "out of range");
   109    luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
   110    if (posi > pose) return 0;  /* empty interval; return no values */
   111    if (pose - posi >= INT_MAX)  /* (lua_Integer -> int) overflow? */
   112      return luaL_error(L, "string slice too long");
   113    n = (int)(pose -  posi) + 1;
   114    luaL_checkstack(L, n, "string slice too long");
   115    n = 0;
   116    se = s + pose;
   117    for (s += posi - 1; s < se;) {
   118      int code;
   119      s = utf8_decode(s, &code);
   120      if (s == NULL)
   121        return luaL_error(L, "invalid UTF-8 code");
   122      lua_pushinteger(L, code);
   123      n++;
   124    }
   125    return n;
   126  }
   127  
   128  
   129  static void pushutfchar (lua_State *L, int arg) {
   130    char buf[UTF8_MAX];
   131    int size;
   132  
   133    lua_Integer code = luaL_checkinteger(L, arg);
   134    luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
   135    size = lua_util_utf8_encode(buf, code);
   136    lua_pushlstring(L, buf, size);
   137  }
   138  
   139  
   140  /*
   141  ** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
   142  */
   143  static int utfchar (lua_State *L) {
   144    int n = lua_gettop(L);  /* number of arguments */
   145    if (n == 1)  /* optimize common case of single char */
   146      pushutfchar(L, 1);
   147    else {
   148      int i;
   149      luaL_Buffer b;
   150      luaL_buffinit(L, &b);
   151      for (i = 1; i <= n; i++) {
   152        pushutfchar(L, i);
   153        luaL_addvalue(&b);
   154      }
   155      luaL_pushresult(&b);
   156    }
   157    return 1;
   158  }
   159  
   160  
   161  /*
   162  ** offset(s, n, [i])  -> index where n-th character counting from
   163  **   position 'i' starts; 0 means character at 'i'.
   164  */
   165  static int byteoffset (lua_State *L) {
   166    size_t len;
   167    const char *s = luaL_checklstring(L, 1, &len);
   168    lua_Integer n  = luaL_checkinteger(L, 2);
   169    lua_Integer posi = (n >= 0) ? 1 : len + 1;
   170    posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
   171    luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
   172                     "position out of range");
   173    if (n == 0) {
   174      /* find beginning of current byte sequence */
   175      while (posi > 0 && iscont(s + posi)) posi--;
   176    }
   177    else {
   178      if (iscont(s + posi))
   179        return luaL_error(L, "initial position is a continuation byte");
   180      if (n < 0) {
   181         while (n < 0 && posi > 0) {  /* move back */
   182           do {  /* find beginning of previous character */
   183             posi--;
   184           } while (posi > 0 && iscont(s + posi));
   185           n++;
   186         }
   187       }
   188       else {
   189         n--;  /* do not move for 1st character */
   190         while (n > 0 && posi < (lua_Integer)len) {
   191           do {  /* find beginning of next character */
   192             posi++;
   193           } while (iscont(s + posi));  /* (cannot pass final '\0') */
   194           n--;
   195         }
   196       }
   197    }
   198    if (n == 0)  /* did it find given character? */
   199      lua_pushinteger(L, posi + 1);
   200    else  /* no such character */
   201      lua_pushnil(L);
   202    return 1;
   203  }
   204  
   205  
   206  static int iter_aux (lua_State *L) {
   207    size_t len;
   208    const char *s = luaL_checklstring(L, 1, &len);
   209    lua_Integer n = lua_tointeger(L, 2) - 1;
   210    if (n < 0)  /* first iteration? */
   211      n = 0;  /* start from here */
   212    else if (n < (lua_Integer)len) {
   213      n++;  /* skip current byte */
   214      while (iscont(s + n)) n++;  /* and its continuations */
   215    }
   216    if (n >= (lua_Integer)len)
   217      return 0;  /* no more codepoints */
   218    else {
   219      int code;
   220      const char *next = utf8_decode(s + n, &code);
   221      if (next == NULL || iscont(next))
   222        return luaL_error(L, "invalid UTF-8 code");
   223      lua_pushinteger(L, n + 1);
   224      lua_pushinteger(L, code);
   225      return 2;
   226    }
   227  }
   228  
   229  
/*
** Set up an iteration over the characters of the string given as
** argument 1 (raises an error if it is not a string).
** Returns three values: the step function iter_aux, the string itself,
** and 0 as the initial control value, which iter_aux treats as "first
** iteration".
*/
static int iter_codes (lua_State *L) {
  luaL_checkstring(L, 1);
  lua_pushcfunction(L, iter_aux);
  lua_pushvalue(L, 1);
  lua_pushinteger(L, 0);
  return 3;
}
   237  
   238  
   239  /* pattern to match a single UTF-8 character */
   240  //#define UTF8PATT	"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
   241  
   242  
/*
** Name -> C function table registered by luaopen_utf8; the
** {NULL, NULL} entry marks the end of the list.
*/
static const luaL_Reg funcs[] = {
  {"offset", byteoffset},  /* byte position of the n-th character */
  {"codepoint", codepoint},  /* code points of characters in a byte range */
  {"char", utfchar},  /* code points -> encoded string */
  {"len", utflen},  /* number of characters in a byte range */
  {"codes", iter_codes},  /* iterator over (position, code point) */
  {NULL, NULL}
};
   251  
   252  
/*
** Register the functions listed in 'funcs' under the library name
** "utf8".
** NOTE(review): one stack slot is popped after registration
** (presumably the library table left by luaL_register) and the
** function still returns 1, so the value reported as its result is
** whatever is then on top of the stack -- confirm that callers ignore
** the return value.
*/
int luaopen_utf8 (lua_State *L) {
  luaL_register(L, "utf8", funcs);
//  luaL_newlib(L, funcs);
  lua_pop(L, 1);
  return 1;
}