github.com/bytedance/sonic@v1.11.7-0.20240517092252-d2edb31b167b/utf8/utf8.go (about) 1 /* 2 * Copyright 2022 ByteDance Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package utf8 18 19 import ( 20 `github.com/bytedance/sonic/internal/rt` 21 `github.com/bytedance/sonic/internal/native/types` 22 `github.com/bytedance/sonic/internal/native` 23 ) 24 25 // CorrectWith corrects the invalid utf8 byte with repl string. 26 func CorrectWith(dst []byte, src []byte, repl string) []byte { 27 sstr := rt.Mem2Str(src) 28 sidx := 0 29 30 /* state machine records the invalid postions */ 31 m := types.NewStateMachine() 32 m.Sp = 0 // invalid utf8 numbers 33 34 for sidx < len(sstr) { 35 scur := sidx 36 ecode := native.ValidateUTF8(&sstr, &sidx, m) 37 38 if m.Sp != 0 { 39 if m.Sp > len(sstr) { 40 panic("numbers of invalid utf8 exceed the string len!") 41 } 42 } 43 44 for i := 0; i < m.Sp; i++ { 45 ipos := m.Vt[i] // invalid utf8 position 46 dst = append(dst, sstr[scur:ipos]...) 47 dst = append(dst, repl...) 48 scur = m.Vt[i] + 1 49 } 50 /* append the remained valid utf8 bytes */ 51 dst = append(dst, sstr[scur:sidx]...) 52 53 /* not enough space, reset and continue */ 54 if ecode != 0 { 55 m.Sp = 0 56 } 57 } 58 59 types.FreeStateMachine(m) 60 return dst 61 } 62 63 // Validate is a simd-accelereated drop-in replacement for the standard library's utf8.Valid. 64 func Validate(src []byte) bool { 65 return ValidateString(rt.Mem2Str(src)) 66 } 67 68 // ValidateString as Validate, but for string. 69 func ValidateString(src string) bool { 70 return native.ValidateUTF8Fast(&src) == 0 71 }