github.com/cloudwego/dynamicgo@v0.2.6-0.20240519101509-707f41b6b834/native/utf8.c (about)

     1  /*
     2   * Copyright (c) 2009 The Go Authors. All rights reserved.
     3   * Modifications Copyright 2023 CloudWeGo Authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   */
    17  
    18  #include "native.h"
    19  
    20  // ascii: 0x00 ~ 0x7F
    21  static inline int _mm_ascii_mask(__m128i vv)
    22  {
    23      return _mm_movemask_epi8(vv);
    24  }
    25  
    26  #if USE_AVX2
    27  
    28  // ascii: 0x00 ~ 0x7F
    29  static inline int _mm256_ascii_mask(__m256i vv)
    30  {
    31      return _mm256_movemask_epi8(vv);
    32  }
    33  
    34  #endif
    35  
    36  static inline bool is_ascii(uint8_t ch)
    37  {
    38      return ch < 0x80;
    39  }
    40  
    41  // The default lowest and highest continuation byte.
    42  const static uint8_t locb = 0x80;
    43  const static uint8_t hicb = 0xBF;
    44  const static uint8_t xx = 0xF1; // invalid: size 1
    45  const static uint8_t as = 0xF0; // ASCII: size 1
    46  const static uint8_t s1 = 0x02; // accept 0, size 2
    47  const static uint8_t s2 = 0x13; // accept 1, size 3
    48  const static uint8_t s3 = 0x03; // accept 0, size 3
    49  const static uint8_t s4 = 0x23; // accept 2, size 3
    50  const static uint8_t s5 = 0x34; // accept 3, size 4
    51  const static uint8_t s6 = 0x04; // accept 0, size 4
    52  const static uint8_t s7 = 0x44; // accept 4, size 4
    53  
// first is information about the first byte in a UTF-8 sequence.
// It is indexed by the byte value; each entry packs two fields that
// nonascii_is_utf8 decodes:
//   - low 3 bits (entry & 7): the sequence size in bytes; 0 marks an ASCII
//     byte and 1 marks a byte that cannot start a valid sequence.
//   - high 4 bits (entry >> 4): the index into `ranges` selecting the allowed
//     interval for the second byte; ASCII and invalid entries carry 0xF.
static const uint8_t first[256] = {
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
    as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
    as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
    as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
    as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
    as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
    as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
    as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    // Continuation bytes (0x80-0xBF) can never start a sequence.
    xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
    xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
    xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
    xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
    // 0xC0 and 0xC1 are rejected; 0xC2-0xDF start 2-byte sequences.
    xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
    s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
    // 3-byte leads; 0xE0 and 0xED use restricted second-byte ranges.
    s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
    // 4-byte leads 0xF0-0xF4; 0xF5-0xFF are rejected.
    s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
};
    75  
    76  // AcceptRange gives the range of valid values for the second byte in a UTF-8
    77  // sequence.
    78  struct AcceptRange
    79  {
    80      uint8_t lo; // lowest value for second byte.
    81      uint8_t hi; // highest value for second byte.
    82  };
    83  
    84  // ranges has size 16 to avoid bounds checks in the code that uses it.
    85  const static struct AcceptRange ranges[5] = {
    86      {locb, hicb}, // 0
    87      {0xA0, hicb}, // 1
    88      {locb, 0x9F}, // 2
    89      {0x90, hicb}, // 3
    90      {locb, 0x8F}, // 4
    91  };
    92  
    93  //  UTF-8 code point  | first byte | second byte | third byte | fourth byte
    94  //  U+0000  -  U+007F | 0___ ____
    95  //  U+0080  -  U+07FF | 110_ ____  | 10__ ____
    96  //  U+0800  -  U+D7FF | 1110 ____  | 10__ ____   | 10__ ____
    97  //  U+D800  -  U+DFFF | reserved for UTF-16 surrogate pairs
    98  //  U+E000  -  U+FFFF | 1110 ____  | 10__ ____   | 10__ ____
    99  // U+10000 - U+10FFFF | 1111 0___  | 10__ ____   | 10__ ____  | 10__ ____
   100  // checks non-ascii characters, and returns the utf-8 length
   101  static inline ssize_t nonascii_is_utf8(const uint8_t *sp, size_t n)
   102  {
   103      uint8_t mask = first[sp[0]];
   104      uint8_t size = mask & 7;
   105      if (n < size)
   106      {
   107          return 0;
   108      }
   109      struct AcceptRange accept = ranges[mask >> 4];
   110      switch (size)
   111      {
   112      case 4:
   113          if (sp[3] < locb || hicb < sp[3])
   114              return 0;
   115      case 3:
   116          if (sp[2] < locb || hicb < sp[2])
   117              return 0;
   118      case 2:
   119          if (sp[1] < accept.lo || accept.hi < sp[1])
   120              return 0;
   121          break;
   122      case 1:
   123          return 0; // invalid chars
   124      case 0:
   125          return 1; // ascii chars
   126      default:
   127          return 0;
   128      }
   129      return size;
   130  }
   131  
   132  ssize_t find_non_ascii(const uint8_t *sp, ssize_t nb)
   133  {
   134      const uint8_t *ss = sp;
   135      int64_t m;
   136  
   137  #if USE_AVX2
   138      while (nb >= 32)
   139      {
   140          __m256i v = _mm256_loadu_si256((const void *)(sp));
   141          if (unlikely((m = _mm256_ascii_mask(v)) != 0))
   142          {
   143              return sp - ss + __builtin_ctzll(m);
   144          }
   145          nb -= 32;
   146          sp += 32;
   147      }
   148  
   149      /* clear spper half to avoid AVX-SSE transition penalty */
   150      _mm256_zeroupper();
   151  #endif
   152  
   153      while (nb >= 16)
   154      {
   155          __m128i v = _mm_loadu_si128((const void *)(sp));
   156          if (unlikely((m = _mm_ascii_mask(v)) != 0))
   157          {
   158              return sp - ss + __builtin_ctzll(m);
   159          }
   160          nb -= 16;
   161          sp += 16;
   162      }
   163  
   164      /* remaining bytes, do with scalar code */
   165      while (nb-- > 0)
   166      {
   167          if (is_ascii(*sp))
   168          {
   169              sp++;
   170          }
   171          else
   172          {
   173              return sp - ss;
   174          }
   175      }
   176  
   177      /* nothing found */
   178      return -1;
   179  }
   180  
   181  // utf8_validate validates whether the JSON string is valid UTF-8.
   182  // return -1 if validate, otherwise, return the error postion.
   183  ssize_t utf8_validate(const char *sp, ssize_t nb)
   184  {
   185      const uint8_t *p = (const uint8_t *)sp;
   186      const uint8_t *s = (const uint8_t *)sp;
   187      ssize_t n;
   188      ssize_t b;
   189  
   190      // Optimize for the continuous non-ascii chars */
   191      while (nb > 0 && (n = (!is_ascii(*p) ? 0 : find_non_ascii(p, nb))) != -1)
   192      {
   193          /* not found non-ascii in string */
   194          if (n >= nb)
   195          {
   196              return -1;
   197          }
   198  
   199          nb -= n;
   200          p += n;
   201  
   202          /* validate the non-ascii */
   203          if (unlikely((b = nonascii_is_utf8(p, nb)) == 0))
   204          {
   205              return p - s;
   206          }
   207  
   208          nb -= b;
   209          p += b;
   210      }
   211  
   212      return -1;
   213  }