github.com/bytedance/sonic@v1.11.7-0.20240517092252-d2edb31b167b/utf8/utf8.go (about)

     1  /*
     2   * Copyright 2022 ByteDance Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package utf8
    18  
    19  import (
    20      `github.com/bytedance/sonic/internal/rt`
    21      `github.com/bytedance/sonic/internal/native/types`
    22      `github.com/bytedance/sonic/internal/native`
    23  )
    24  
    25  // CorrectWith corrects the invalid utf8 byte with repl string.
    26  func CorrectWith(dst []byte, src []byte, repl string) []byte {
    27      sstr := rt.Mem2Str(src)
    28      sidx := 0
    29  
    30      /* state machine records the invalid postions */
    31      m := types.NewStateMachine()
    32      m.Sp = 0 // invalid utf8 numbers
    33  
    34      for sidx < len(sstr) {
    35          scur  := sidx
    36          ecode := native.ValidateUTF8(&sstr, &sidx, m)
    37  
    38          if m.Sp != 0 {
    39              if m.Sp > len(sstr) {
    40                  panic("numbers of invalid utf8 exceed the string len!")
    41              }
    42          }
    43          
    44          for i := 0; i < m.Sp; i++ {
    45              ipos := m.Vt[i] // invalid utf8 position
    46              dst  = append(dst, sstr[scur:ipos]...)
    47              dst  = append(dst, repl...)
    48              scur = m.Vt[i] + 1
    49          }
    50          /* append the remained valid utf8 bytes */
    51          dst = append(dst, sstr[scur:sidx]...)
    52  
    53          /* not enough space, reset and continue */
    54          if ecode != 0 {
    55              m.Sp = 0
    56          }
    57      }
    58  
    59      types.FreeStateMachine(m)
    60      return dst
    61  }
    62  
    63  // Validate is a simd-accelereated drop-in replacement for the standard library's utf8.Valid.
    64  func Validate(src []byte) bool {
    65      return ValidateString(rt.Mem2Str(src))
    66  }
    67  
    68  // ValidateString as Validate, but for string.
    69  func ValidateString(src string) bool {
    70      return native.ValidateUTF8Fast(&src) == 0
    71  }