github.com/goshafaq/sonic@v0.0.0-20231026082336-871835fb94c6/utf8/utf8.go (about)

     1  /*
     2   * Copyright 2022 ByteDance Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package utf8
    18  
    19  import (
    20  	"github.com/goshafaq/sonic/internal/native"
    21  	"github.com/goshafaq/sonic/internal/native/types"
    22  	"github.com/goshafaq/sonic/internal/rt"
    23  )
    24  
    25  // CorrectWith corrects the invalid utf8 byte with repl string.
    26  func CorrectWith(dst []byte, src []byte, repl string) []byte {
    27  	sstr := rt.Mem2Str(src)
    28  	sidx := 0
    29  
    30  	/* state machine records the invalid postions */
    31  	m := types.NewStateMachine()
    32  	m.Sp = 0 // invalid utf8 numbers
    33  
    34  	for sidx < len(sstr) {
    35  		scur := sidx
    36  		ecode := native.ValidateUTF8(&sstr, &sidx, m)
    37  
    38  		if m.Sp != 0 {
    39  			if m.Sp > len(sstr) {
    40  				panic("numbers of invalid utf8 exceed the string len!")
    41  			}
    42  		}
    43  
    44  		for i := 0; i < m.Sp; i++ {
    45  			ipos := m.Vt[i] // invalid utf8 position
    46  			dst = append(dst, sstr[scur:ipos]...)
    47  			dst = append(dst, repl...)
    48  			scur = m.Vt[i] + 1
    49  		}
    50  		/* append the remained valid utf8 bytes */
    51  		dst = append(dst, sstr[scur:sidx]...)
    52  
    53  		/* not enough space, reset and continue */
    54  		if ecode != 0 {
    55  			m.Sp = 0
    56  		}
    57  	}
    58  
    59  	types.FreeStateMachine(m)
    60  	return dst
    61  }
    62  
    63  // Validate is a simd-accelereated drop-in replacement for the standard library's utf8.Valid.
    64  func Validate(src []byte) bool {
    65  	return ValidateString(rt.Mem2Str(src))
    66  }
    67  
    68  // ValidateString as Validate, but for string.
    69  func ValidateString(src string) bool {
    70  	return native.ValidateUTF8Fast(&src) == 0
    71  }