modernc.org/cc@v1.0.1/v2/testdata/_sqlite/src/utf.c (about)

     1  /*
     2  ** 2004 April 13
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  *************************************************************************
    12  ** This file contains routines used to translate between UTF-8, 
    13  ** UTF-16, UTF-16BE, and UTF-16LE.
    14  **
    15  ** Notes on UTF-8:
    16  **
    17  **   Byte-0    Byte-1    Byte-2    Byte-3    Value
    18  **  0xxxxxxx                                 00000000 00000000 0xxxxxxx
    19  **  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
    20  **  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
    21  **  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
    22  **
    23  **
    24  ** Notes on UTF-16:  (with wwww+1==uuuuu)
    25  **
    26  **      Word-0               Word-1          Value
    27  **  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
    28  **  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
    29  **
    30  **
    31  ** BOM or Byte Order Mark:
    32  **     0xff 0xfe   little-endian utf-16 follows
    33  **     0xfe 0xff   big-endian utf-16 follows
    34  **
    35  */
    36  #include "sqliteInt.h"
    37  #include <assert.h>
    38  #include "vdbeInt.h"
    39  
#if !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0
/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.  It is only defined by this file when the
** build is not the amalgamation and SQLITE_BYTEORDER is 0.
*/
const int sqlite3one = 1;
#endif /* !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0 */
    47  
/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.
**
** The table is indexed by (firstByte - 0xc0), giving one entry for each
** possible lead byte 0xc0 through 0xff.  Each entry is the payload carried
** by that lead byte once its length-marker bits are stripped:
**
**    lead byte      entry
**    0xc0..0xdf     low 5 bits of the byte
**    0xe0..0xef     low 4 bits of the byte
**    0xf0..0xf7     low 3 bits of the byte
**    0xf8..0xfb     low 2 bits of the byte
**    0xfc..0xfd     low 1 bit of the byte
**    0xfe..0xff     zero
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
    62  
    63  
    64  #define WRITE_UTF8(zOut, c) {                          \
    65    if( c<0x00080 ){                                     \
    66      *zOut++ = (u8)(c&0xFF);                            \
    67    }                                                    \
    68    else if( c<0x00800 ){                                \
    69      *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
    70      *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
    71    }                                                    \
    72    else if( c<0x10000 ){                                \
    73      *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
    74      *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    75      *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
    76    }else{                                               \
    77      *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
    78      *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
    79      *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    80      *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
    81    }                                                    \
    82  }
    83  
    84  #define WRITE_UTF16LE(zOut, c) {                                    \
    85    if( c<=0xFFFF ){                                                  \
    86      *zOut++ = (u8)(c&0x00FF);                                       \
    87      *zOut++ = (u8)((c>>8)&0x00FF);                                  \
    88    }else{                                                            \
    89      *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0));  \
    90      *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03));              \
    91      *zOut++ = (u8)(c&0x00FF);                                       \
    92      *zOut++ = (u8)(0x00DC + ((c>>8)&0x03));                         \
    93    }                                                                 \
    94  }
    95  
    96  #define WRITE_UTF16BE(zOut, c) {                                    \
    97    if( c<=0xFFFF ){                                                  \
    98      *zOut++ = (u8)((c>>8)&0x00FF);                                  \
    99      *zOut++ = (u8)(c&0x00FF);                                       \
   100    }else{                                                            \
   101      *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03));              \
   102      *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0));  \
   103      *zOut++ = (u8)(0x00DC + ((c>>8)&0x03));                         \
   104      *zOut++ = (u8)(c&0x00FF);                                       \
   105    }                                                                 \
   106  }
   107  
   108  #define READ_UTF16LE(zIn, TERM, c){                                   \
   109    c = (*zIn++);                                                       \
   110    c += ((*zIn++)<<8);                                                 \
   111    if( c>=0xD800 && c<0xE000 && TERM ){                                \
   112      int c2 = (*zIn++);                                                \
   113      c2 += ((*zIn++)<<8);                                              \
   114      c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
   115    }                                                                   \
   116  }
   117  
   118  #define READ_UTF16BE(zIn, TERM, c){                                   \
   119    c = ((*zIn++)<<8);                                                  \
   120    c += (*zIn++);                                                      \
   121    if( c>=0xD800 && c<0xE000 && TERM ){                                \
   122      int c2 = ((*zIn++)<<8);                                           \
   123      c2 += (*zIn++);                                                   \
   124      c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
   125    }                                                                   \
   126  }
   127  
   128  /*
   129  ** Translate a single UTF-8 character.  Return the unicode value.
   130  **
   131  ** During translation, assume that the byte that zTerm points
   132  ** is a 0x00.
   133  **
   134  ** Write a pointer to the next unread byte back into *pzNext.
   135  **
   136  ** Notes On Invalid UTF-8:
   137  **
   138  **  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
   139  **     be encoded as a multi-byte character.  Any multi-byte character that
   140  **     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
   141  **
   142  **  *  This routine never allows a UTF16 surrogate value to be encoded.
   143  **     If a multi-byte character attempts to encode a value between
   144  **     0xd800 and 0xe000 then it is rendered as 0xfffd.
   145  **
   146  **  *  Bytes in the range of 0x80 through 0xbf which occur as the first
   147  **     byte of a character are interpreted as single-byte characters
   148  **     and rendered as themselves even though they are technically
   149  **     invalid characters.
   150  **
   151  **  *  This routine accepts over-length UTF8 encodings
   152  **     for unicode values 0x80 and greater.  It does not change over-length
   153  **     encodings to 0xfffd as some systems recommend.
   154  */
   155  #define READ_UTF8(zIn, zTerm, c)                           \
   156    c = *(zIn++);                                            \
   157    if( c>=0xc0 ){                                           \
   158      c = sqlite3Utf8Trans1[c-0xc0];                         \
   159      while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
   160        c = (c<<6) + (0x3f & *(zIn++));                      \
   161      }                                                      \
   162      if( c<0x80                                             \
   163          || (c&0xFFFFF800)==0xD800                          \
   164          || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
   165    }
   166  u32 sqlite3Utf8Read(
   167    const unsigned char **pz    /* Pointer to string from which to read char */
   168  ){
   169    unsigned int c;
   170  
   171    /* Same as READ_UTF8() above but without the zTerm parameter.
   172    ** For this routine, we assume the UTF8 string is always zero-terminated.
   173    */
   174    c = *((*pz)++);
   175    if( c>=0xc0 ){
   176      c = sqlite3Utf8Trans1[c-0xc0];
   177      while( (*(*pz) & 0xc0)==0x80 ){
   178        c = (c<<6) + (0x3f & *((*pz)++));
   179      }
   180      if( c<0x80
   181          || (c&0xFFFFF800)==0xD800
   182          || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }
   183    }
   184    return c;
   185  }
   186  
   187  
   188  
   189  
   190  /*
   191  ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
   192  ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
   193  */ 
   194  /* #define TRANSLATE_TRACE 1 */
   195  
   196  #ifndef SQLITE_OMIT_UTF16
   197  /*
   198  ** This routine transforms the internal text encoding used by pMem to
   199  ** desiredEnc. It is an error if the string is already of the desired
   200  ** encoding, or if *pMem does not contain a string value.
   201  */
   202  SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
   203    int len;                    /* Maximum length of output string in bytes */
   204    unsigned char *zOut;                  /* Output buffer */
   205    unsigned char *zIn;                   /* Input iterator */
   206    unsigned char *zTerm;                 /* End of input */
   207    unsigned char *z;                     /* Output iterator */
   208    unsigned int c;
   209  
   210    assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
   211    assert( pMem->flags&MEM_Str );
   212    assert( pMem->enc!=desiredEnc );
   213    assert( pMem->enc!=0 );
   214    assert( pMem->n>=0 );
   215  
   216  #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
   217    {
   218      char zBuf[100];
   219      sqlite3VdbeMemPrettyPrint(pMem, zBuf);
   220      fprintf(stderr, "INPUT:  %s\n", zBuf);
   221    }
   222  #endif
   223  
   224    /* If the translation is between UTF-16 little and big endian, then 
   225    ** all that is required is to swap the byte order. This case is handled
   226    ** differently from the others.
   227    */
   228    if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
   229      u8 temp;
   230      int rc;
   231      rc = sqlite3VdbeMemMakeWriteable(pMem);
   232      if( rc!=SQLITE_OK ){
   233        assert( rc==SQLITE_NOMEM );
   234        return SQLITE_NOMEM_BKPT;
   235      }
   236      zIn = (u8*)pMem->z;
   237      zTerm = &zIn[pMem->n&~1];
   238      while( zIn<zTerm ){
   239        temp = *zIn;
   240        *zIn = *(zIn+1);
   241        zIn++;
   242        *zIn++ = temp;
   243      }
   244      pMem->enc = desiredEnc;
   245      goto translate_out;
   246    }
   247  
   248    /* Set len to the maximum number of bytes required in the output buffer. */
   249    if( desiredEnc==SQLITE_UTF8 ){
   250      /* When converting from UTF-16, the maximum growth results from
   251      ** translating a 2-byte character to a 4-byte UTF-8 character.
   252      ** A single byte is required for the output string
   253      ** nul-terminator.
   254      */
   255      pMem->n &= ~1;
   256      len = pMem->n * 2 + 1;
   257    }else{
   258      /* When converting from UTF-8 to UTF-16 the maximum growth is caused
   259      ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
   260      ** character. Two bytes are required in the output buffer for the
   261      ** nul-terminator.
   262      */
   263      len = pMem->n * 2 + 2;
   264    }
   265  
   266    /* Set zIn to point at the start of the input buffer and zTerm to point 1
   267    ** byte past the end.
   268    **
   269    ** Variable zOut is set to point at the output buffer, space obtained
   270    ** from sqlite3_malloc().
   271    */
   272    zIn = (u8*)pMem->z;
   273    zTerm = &zIn[pMem->n];
   274    zOut = sqlite3DbMallocRaw(pMem->db, len);
   275    if( !zOut ){
   276      return SQLITE_NOMEM_BKPT;
   277    }
   278    z = zOut;
   279  
   280    if( pMem->enc==SQLITE_UTF8 ){
   281      if( desiredEnc==SQLITE_UTF16LE ){
   282        /* UTF-8 -> UTF-16 Little-endian */
   283        while( zIn<zTerm ){
   284          READ_UTF8(zIn, zTerm, c);
   285          WRITE_UTF16LE(z, c);
   286        }
   287      }else{
   288        assert( desiredEnc==SQLITE_UTF16BE );
   289        /* UTF-8 -> UTF-16 Big-endian */
   290        while( zIn<zTerm ){
   291          READ_UTF8(zIn, zTerm, c);
   292          WRITE_UTF16BE(z, c);
   293        }
   294      }
   295      pMem->n = (int)(z - zOut);
   296      *z++ = 0;
   297    }else{
   298      assert( desiredEnc==SQLITE_UTF8 );
   299      if( pMem->enc==SQLITE_UTF16LE ){
   300        /* UTF-16 Little-endian -> UTF-8 */
   301        while( zIn<zTerm ){
   302          READ_UTF16LE(zIn, zIn<zTerm, c); 
   303          WRITE_UTF8(z, c);
   304        }
   305      }else{
   306        /* UTF-16 Big-endian -> UTF-8 */
   307        while( zIn<zTerm ){
   308          READ_UTF16BE(zIn, zIn<zTerm, c); 
   309          WRITE_UTF8(z, c);
   310        }
   311      }
   312      pMem->n = (int)(z - zOut);
   313    }
   314    *z = 0;
   315    assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
   316  
   317    c = pMem->flags;
   318    sqlite3VdbeMemRelease(pMem);
   319    pMem->flags = MEM_Str|MEM_Term|(c&(MEM_AffMask|MEM_Subtype));
   320    pMem->enc = desiredEnc;
   321    pMem->z = (char*)zOut;
   322    pMem->zMalloc = pMem->z;
   323    pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z);
   324  
   325  translate_out:
   326  #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
   327    {
   328      char zBuf[100];
   329      sqlite3VdbeMemPrettyPrint(pMem, zBuf);
   330      fprintf(stderr, "OUTPUT: %s\n", zBuf);
   331    }
   332  #endif
   333    return SQLITE_OK;
   334  }
   335  #endif /* SQLITE_OMIT_UTF16 */
   336  
   337  #ifndef SQLITE_OMIT_UTF16
   338  /*
   339  ** This routine checks for a byte-order mark at the beginning of the 
   340  ** UTF-16 string stored in *pMem. If one is present, it is removed and
   341  ** the encoding of the Mem adjusted. This routine does not do any
   342  ** byte-swapping, it just sets Mem.enc appropriately.
   343  **
   344  ** The allocation (static, dynamic etc.) and encoding of the Mem may be
   345  ** changed by this function.
   346  */
   347  int sqlite3VdbeMemHandleBom(Mem *pMem){
   348    int rc = SQLITE_OK;
   349    u8 bom = 0;
   350  
   351    assert( pMem->n>=0 );
   352    if( pMem->n>1 ){
   353      u8 b1 = *(u8 *)pMem->z;
   354      u8 b2 = *(((u8 *)pMem->z) + 1);
   355      if( b1==0xFE && b2==0xFF ){
   356        bom = SQLITE_UTF16BE;
   357      }
   358      if( b1==0xFF && b2==0xFE ){
   359        bom = SQLITE_UTF16LE;
   360      }
   361    }
   362    
   363    if( bom ){
   364      rc = sqlite3VdbeMemMakeWriteable(pMem);
   365      if( rc==SQLITE_OK ){
   366        pMem->n -= 2;
   367        memmove(pMem->z, &pMem->z[2], pMem->n);
   368        pMem->z[pMem->n] = '\0';
   369        pMem->z[pMem->n+1] = '\0';
   370        pMem->flags |= MEM_Term;
   371        pMem->enc = bom;
   372      }
   373    }
   374    return rc;
   375  }
   376  #endif /* SQLITE_OMIT_UTF16 */
   377  
   378  /*
   379  ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
   380  ** return the number of unicode characters in pZ up to (but not including)
   381  ** the first 0x00 byte. If nByte is not less than zero, return the
   382  ** number of unicode characters in the first nByte of pZ (or up to 
   383  ** the first 0x00, whichever comes first).
   384  */
   385  int sqlite3Utf8CharLen(const char *zIn, int nByte){
   386    int r = 0;
   387    const u8 *z = (const u8*)zIn;
   388    const u8 *zTerm;
   389    if( nByte>=0 ){
   390      zTerm = &z[nByte];
   391    }else{
   392      zTerm = (const u8*)(-1);
   393    }
   394    assert( z<=zTerm );
   395    while( *z!=0 && z<zTerm ){
   396      SQLITE_SKIP_UTF8(z);
   397      r++;
   398    }
   399    return r;
   400  }
   401  
   402  /* This test function is not currently used by the automated test-suite. 
   403  ** Hence it is only available in debug builds.
   404  */
   405  #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
   406  /*
   407  ** Translate UTF-8 to UTF-8.
   408  **
   409  ** This has the effect of making sure that the string is well-formed
   410  ** UTF-8.  Miscoded characters are removed.
   411  **
   412  ** The translation is done in-place and aborted if the output
   413  ** overruns the input.
   414  */
   415  int sqlite3Utf8To8(unsigned char *zIn){
   416    unsigned char *zOut = zIn;
   417    unsigned char *zStart = zIn;
   418    u32 c;
   419  
   420    while( zIn[0] && zOut<=zIn ){
   421      c = sqlite3Utf8Read((const u8**)&zIn);
   422      if( c!=0xfffd ){
   423        WRITE_UTF8(zOut, c);
   424      }
   425    }
   426    *zOut = 0;
   427    return (int)(zOut - zStart);
   428  }
   429  #endif
   430  
   431  #ifndef SQLITE_OMIT_UTF16
   432  /*
   433  ** Convert a UTF-16 string in the native encoding into a UTF-8 string.
   434  ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
   435  ** be freed by the calling function.
   436  **
   437  ** NULL is returned if there is an allocation error.
   438  */
   439  char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
   440    Mem m;
   441    memset(&m, 0, sizeof(m));
   442    m.db = db;
   443    sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
   444    sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
   445    if( db->mallocFailed ){
   446      sqlite3VdbeMemRelease(&m);
   447      m.z = 0;
   448    }
   449    assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
   450    assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
   451    assert( m.z || db->mallocFailed );
   452    return m.z;
   453  }
   454  
   455  /*
   456  ** zIn is a UTF-16 encoded unicode string at least nChar characters long.
   457  ** Return the number of bytes in the first nChar unicode characters
   458  ** in pZ.  nChar must be non-negative.
   459  */
   460  int sqlite3Utf16ByteLen(const void *zIn, int nChar){
   461    int c;
   462    unsigned char const *z = zIn;
   463    int n = 0;
   464    
   465    if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
   466      while( n<nChar ){
   467        READ_UTF16BE(z, 1, c);
   468        n++;
   469      }
   470    }else{
   471      while( n<nChar ){
   472        READ_UTF16LE(z, 1, c);
   473        n++;
   474      }
   475    }
   476    return (int)(z-(unsigned char const *)zIn);
   477  }
   478  
   479  #if defined(SQLITE_TEST)
   480  /*
   481  ** This routine is called from the TCL test function "translate_selftest".
   482  ** It checks that the primitives for serializing and deserializing
   483  ** characters in each encoding are inverses of each other.
   484  */
   485  void sqlite3UtfSelfTest(void){
   486    unsigned int i, t;
   487    unsigned char zBuf[20];
   488    unsigned char *z;
   489    int n;
   490    unsigned int c;
   491  
   492    for(i=0; i<0x00110000; i++){
   493      z = zBuf;
   494      WRITE_UTF8(z, i);
   495      n = (int)(z-zBuf);
   496      assert( n>0 && n<=4 );
   497      z[0] = 0;
   498      z = zBuf;
   499      c = sqlite3Utf8Read((const u8**)&z);
   500      t = i;
   501      if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
   502      if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
   503      assert( c==t );
   504      assert( (z-zBuf)==n );
   505    }
   506    for(i=0; i<0x00110000; i++){
   507      if( i>=0xD800 && i<0xE000 ) continue;
   508      z = zBuf;
   509      WRITE_UTF16LE(z, i);
   510      n = (int)(z-zBuf);
   511      assert( n>0 && n<=4 );
   512      z[0] = 0;
   513      z = zBuf;
   514      READ_UTF16LE(z, 1, c);
   515      assert( c==i );
   516      assert( (z-zBuf)==n );
   517    }
   518    for(i=0; i<0x00110000; i++){
   519      if( i>=0xD800 && i<0xE000 ) continue;
   520      z = zBuf;
   521      WRITE_UTF16BE(z, i);
   522      n = (int)(z-zBuf);
   523      assert( n>0 && n<=4 );
   524      z[0] = 0;
   525      z = zBuf;
   526      READ_UTF16BE(z, 1, c);
   527      assert( c==i );
   528      assert( (z-zBuf)==n );
   529    }
   530  }
   531  #endif /* SQLITE_TEST */
   532  #endif /* SQLITE_OMIT_UTF16 */