github.com/goshafaq/sonic@v0.0.0-20231026082336-871835fb94c6/utf8/utf8_test.go (about)

     1  /*
     2   * Copyright 2022 ByteDance Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package utf8
    18  
    19  import (
    20  	"bytes"
    21  	"github.com/stretchr/testify/assert"
    22  	"math/rand"
    23  	"strings"
    24  	"testing"
    25  	"unicode/utf8"
    26  )
    27  
    28  var (
    29  	_Header_2Bytes  = string([]byte{0xC0})
    30  	_Header_3Bytes  = string([]byte{0xE0})
    31  	_Header_4Bytes  = string([]byte{0xF0})
    32  	_Low_Surrogate  = string([]byte{0xED, 0xA0, 0x80}) // \ud800
    33  	_High_Surrogate = string([]byte{0xED, 0xB0, 0x80}) // \udc00
    34  	_Cont           = "\xb0"
    35  )
    36  
    37  func TestCorrectWith_InvalidUtf8(t *testing.T) {
    38  	var tests = []struct {
    39  		name   string
    40  		input  string
    41  		expect string
    42  		errpos int
    43  	}{
    44  		{"basic", `abc`, "abc", -1},
    45  		{"long", strings.Repeat("helloα,景😊", 1000), strings.Repeat("helloα,景😊", 1000), -1},
    46  
    47  		// invalid utf8 - single byte
    48  		{"single_Cont", _Cont, "\ufffd", 0},
    49  		{"single_Header_2Bytes", _Header_2Bytes, "\ufffd", 0},
    50  		{"single_Header_3Bytes", _Header_3Bytes, "\ufffd", 0},
    51  		{"single_Header_4Bytes", _Header_4Bytes, "\ufffd", 0},
    52  
    53  		// invalid utf8 - two bytes
    54  		{"two_Header_2Bytes + _Cont", _Header_2Bytes + _Cont, "\ufffd\ufffd", 0},
    55  		{`two_Header_4Bytes + _Cont+ "xx"`, _Header_4Bytes + _Cont + "xx", "\ufffd\ufffdxx", 0},
    56  		{`"xx" + three_Header_4Bytes + _Cont + _Cont`, "xx" + _Header_4Bytes + _Cont + _Cont, "xx\ufffd\ufffd\ufffd", 2},
    57  
    58  		// invalid utf8 - three bytes
    59  		{`three_Low_Surrogate`, _Low_Surrogate, "\ufffd\ufffd\ufffd", 0},
    60  		{`three__High_Surrogate`, _High_Surrogate, "\ufffd\ufffd\ufffd", 0},
    61  
    62  		// invalid utf8 - multi bytes
    63  		{`_High_Surrogate + _Low_Surrogate`, _High_Surrogate + _Low_Surrogate, "\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", 0},
    64  		{`"\x80\x80\x80\x80"`, "\x80\x80\x80\x80", "\ufffd\ufffd\ufffd\ufffd", 0},
    65  	}
    66  	for _, test := range tests {
    67  		got := CorrectWith(nil, []byte(test.input), "\ufffd")
    68  		assert.Equal(t, []byte(test.expect), got, test.name)
    69  		assert.Equal(t, test.errpos == -1, utf8.ValidString(test.input), test.name)
    70  	}
    71  }
    72  
    73  func genRandBytes(length int) []byte {
    74  	var buf bytes.Buffer
    75  	for j := 0; j < length; j++ {
    76  		buf.WriteByte(byte(rand.Intn(0xFF + 1)))
    77  	}
    78  	return buf.Bytes()
    79  }
    80  
    81  func genRandAscii(length int) []byte {
    82  	var buf bytes.Buffer
    83  	for j := 0; j < length; j++ {
    84  		buf.WriteByte(byte(rand.Intn(0x7F + 1)))
    85  	}
    86  	return buf.Bytes()
    87  }
    88  
    89  func genRandRune(length int) []byte {
    90  	var buf bytes.Buffer
    91  	for j := 0; j < length; j++ {
    92  		buf.WriteRune(rune(rand.Intn(0x10FFFF + 1)))
    93  	}
    94  	return buf.Bytes()
    95  }
    96  
    97  func TestValidate_Random(t *testing.T) {
    98  	// compare with stdlib
    99  	compare := func(t *testing.T, data []byte) {
   100  		assert.Equal(t, utf8.Valid(data), Validate(data), string(data))
   101  	}
   102  
   103  	// random testing
   104  	nums := 1000
   105  	maxLen := 1000
   106  	for i := 0; i < nums; i++ {
   107  		length := rand.Intn(maxLen)
   108  		compare(t, genRandBytes(length))
   109  		compare(t, genRandRune(length))
   110  	}
   111  }
   112  
   113  func BenchmarkValidate(b *testing.B) {
   114  	bench := []struct {
   115  		name string
   116  		data []byte
   117  	}{
   118  		{"ValidAscii", genRandAscii(1000)},
   119  		{"ValidUTF8", genRandRune(1000)},
   120  		{"RandomBytes", genRandBytes(1000)},
   121  	}
   122  
   123  	for _, test := range bench {
   124  		if utf8.Valid(test.data) != Validate(test.data) {
   125  			b.Fatalf("sonic utf8 validate wrong for %s string: %v", test.name, test.data)
   126  		}
   127  		b.Run("Sonic_"+test.name, func(b *testing.B) {
   128  			for i := 0; i < b.N; i++ {
   129  				Validate(test.data)
   130  			}
   131  		})
   132  		b.Run("StdLib_"+test.name, func(b *testing.B) {
   133  			for i := 0; i < b.N; i++ {
   134  				utf8.Valid(test.data)
   135  			}
   136  		})
   137  	}
   138  }