github.com/goshafaq/sonic@v0.0.0-20231026082336-871835fb94c6/utf8/utf8_test.go (about) 1 /* 2 * Copyright 2022 ByteDance Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package utf8 18 19 import ( 20 "bytes" 21 "github.com/stretchr/testify/assert" 22 "math/rand" 23 "strings" 24 "testing" 25 "unicode/utf8" 26 ) 27 28 var ( 29 _Header_2Bytes = string([]byte{0xC0}) 30 _Header_3Bytes = string([]byte{0xE0}) 31 _Header_4Bytes = string([]byte{0xF0}) 32 _Low_Surrogate = string([]byte{0xED, 0xA0, 0x80}) // \ud800 33 _High_Surrogate = string([]byte{0xED, 0xB0, 0x80}) // \udc00 34 _Cont = "\xb0" 35 ) 36 37 func TestCorrectWith_InvalidUtf8(t *testing.T) { 38 var tests = []struct { 39 name string 40 input string 41 expect string 42 errpos int 43 }{ 44 {"basic", `abc`, "abc", -1}, 45 {"long", strings.Repeat("helloα,景😊", 1000), strings.Repeat("helloα,景😊", 1000), -1}, 46 47 // invalid utf8 - single byte 48 {"single_Cont", _Cont, "\ufffd", 0}, 49 {"single_Header_2Bytes", _Header_2Bytes, "\ufffd", 0}, 50 {"single_Header_3Bytes", _Header_3Bytes, "\ufffd", 0}, 51 {"single_Header_4Bytes", _Header_4Bytes, "\ufffd", 0}, 52 53 // invalid utf8 - two bytes 54 {"two_Header_2Bytes + _Cont", _Header_2Bytes + _Cont, "\ufffd\ufffd", 0}, 55 {`two_Header_4Bytes + _Cont+ "xx"`, _Header_4Bytes + _Cont + "xx", "\ufffd\ufffdxx", 0}, 56 {`"xx" + three_Header_4Bytes + _Cont + _Cont`, "xx" + _Header_4Bytes + _Cont + _Cont, "xx\ufffd\ufffd\ufffd", 2}, 57 58 // invalid utf8 - three bytes 59 {`three_Low_Surrogate`, _Low_Surrogate, "\ufffd\ufffd\ufffd", 0}, 60 {`three__High_Surrogate`, _High_Surrogate, "\ufffd\ufffd\ufffd", 0}, 61 62 // invalid utf8 - multi bytes 63 {`_High_Surrogate + _Low_Surrogate`, _High_Surrogate + _Low_Surrogate, "\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", 0}, 64 {`"\x80\x80\x80\x80"`, "\x80\x80\x80\x80", "\ufffd\ufffd\ufffd\ufffd", 0}, 65 } 66 for _, test := range tests { 67 got := CorrectWith(nil, []byte(test.input), "\ufffd") 68 assert.Equal(t, []byte(test.expect), got, test.name) 69 assert.Equal(t, test.errpos == -1, utf8.ValidString(test.input), test.name) 70 } 71 } 72 73 func genRandBytes(length int) []byte { 74 var buf bytes.Buffer 75 for j := 0; j < length; j++ { 76 buf.WriteByte(byte(rand.Intn(0xFF + 1))) 77 } 78 return buf.Bytes() 79 } 80 81 func genRandAscii(length int) []byte { 82 var buf bytes.Buffer 83 for j := 0; j < length; j++ { 84 buf.WriteByte(byte(rand.Intn(0x7F + 1))) 85 } 86 return buf.Bytes() 87 } 88 89 func genRandRune(length int) []byte { 90 var buf bytes.Buffer 91 for j := 0; j < length; j++ { 92 buf.WriteRune(rune(rand.Intn(0x10FFFF + 1))) 93 } 94 return buf.Bytes() 95 } 96 97 func TestValidate_Random(t *testing.T) { 98 // compare with stdlib 99 compare := func(t *testing.T, data []byte) { 100 assert.Equal(t, utf8.Valid(data), Validate(data), string(data)) 101 } 102 103 // random testing 104 nums := 1000 105 maxLen := 1000 106 for i := 0; i < nums; i++ { 107 length := rand.Intn(maxLen) 108 compare(t, genRandBytes(length)) 109 compare(t, genRandRune(length)) 110 } 111 } 112 113 func BenchmarkValidate(b *testing.B) { 114 bench := []struct { 115 name string 116 data []byte 117 }{ 118 {"ValidAscii", genRandAscii(1000)}, 119 {"ValidUTF8", genRandRune(1000)}, 120 {"RandomBytes", genRandBytes(1000)}, 121 } 122 123 for _, test := range bench { 124 if utf8.Valid(test.data) != Validate(test.data) { 125 b.Fatalf("sonic utf8 validate wrong for %s string: %v", test.name, test.data) 126 } 127 b.Run("Sonic_"+test.name, func(b *testing.B) { 128 for i := 0; i < b.N; i++ { 129 Validate(test.data) 130 } 131 }) 132 b.Run("StdLib_"+test.name, func(b *testing.B) { 133 for i := 0; i < b.N; i++ { 134 utf8.Valid(test.data) 135 } 136 }) 137 } 138 }