github.com/primecitizens/pcz/std@v0.2.1/text/unicode/wtf16/wtf16_test.go

github.com/primecitizens/pcz/std@v0.2.1/text/unicode/wtf16/wtf16_test.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Copyright 2023 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  package wtf16
     9  
    10  import (
    11  	"fmt"
    12  	"slices"
    13  	"testing"
    14  	"unsafe"
    15  
    16  	stdstring "github.com/primecitizens/pcz/std/builtin/string"
    17  	"github.com/primecitizens/pcz/std/text/unicode/utf16"
    18  	"github.com/primecitizens/pcz/std/text/unicode/utf8"
    19  )
    20  
    21  var wtf8tests = []struct {
    22  	str  string
    23  	wstr []uint16
    24  }{
    25  	{
    26  		str:  "\x00",
    27  		wstr: []uint16{0x00},
    28  	},
    29  	{
    30  		str:  "\x5C",
    31  		wstr: []uint16{0x5C},
    32  	},
    33  	{
    34  		str:  "\x7F",
    35  		wstr: []uint16{0x7F},
    36  	},
    37  
    38  	// 2-byte
    39  	{
    40  		str:  "\xC2\x80",
    41  		wstr: []uint16{0x80},
    42  	},
    43  	{
    44  		str:  "\xD7\x8A",
    45  		wstr: []uint16{0x05CA},
    46  	},
    47  	{
    48  		str:  "\xDF\xBF",
    49  		wstr: []uint16{0x07FF},
    50  	},
    51  
    52  	// 3-byte
    53  	{
    54  		str:  "\xE0\xA0\x80",
    55  		wstr: []uint16{0x0800},
    56  	},
    57  	{
    58  		str:  "\xE2\xB0\xBC",
    59  		wstr: []uint16{0x2C3C},
    60  	},
    61  	{
    62  		str:  "\xEF\xBF\xBF",
    63  		wstr: []uint16{0xFFFF},
    64  	},
    65  	// unmatched surrogate halves
    66  	// high surrogates: 0xD800 to 0xDBFF
    67  	{
    68  		str:  "\xED\xA0\x80",
    69  		wstr: []uint16{0xD800},
    70  	},
    71  	{
    72  		// "High surrogate followed by another high surrogate"
    73  		str:  "\xED\xA0\x80\xED\xA0\x80",
    74  		wstr: []uint16{0xD800, 0xD800},
    75  	},
    76  	{
    77  		// "High surrogate followed by a symbol that is not a surrogate"
    78  		str:  string([]byte{0xED, 0xA0, 0x80, 0xA}),
    79  		wstr: []uint16{0xD800, 0xA},
    80  	},
    81  	{
    82  		// "Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate"
    83  		str:  string([]byte{0xED, 0xA0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xA0, 0x80}),
    84  		wstr: []uint16{0xD800, 0xD834, 0xDF06, 0xD800},
    85  	},
    86  	{
    87  		str:  "\xED\xA6\xAF",
    88  		wstr: []uint16{0xD9AF},
    89  	},
    90  	{
    91  		str:  "\xED\xAF\xBF",
    92  		wstr: []uint16{0xDBFF},
    93  	},
    94  	// low surrogates: 0xDC00 to 0xDFFF
    95  	{
    96  		str:  "\xED\xB0\x80",
    97  		wstr: []uint16{0xDC00},
    98  	},
    99  	{
   100  		// "Low surrogate followed by another low surrogate"
   101  		str:  "\xED\xB0\x80\xED\xB0\x80",
   102  		wstr: []uint16{0xDC00, 0xDC00},
   103  	},
   104  	{
   105  		// "Low surrogate followed by a symbol that is not a surrogate"
   106  		str:  string([]byte{0xED, 0xB0, 0x80, 0xA}),
   107  		wstr: []uint16{0xDC00, 0xA},
   108  	},
   109  	{
   110  		// "Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate"
   111  		str:  string([]byte{0xED, 0xB0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xB0, 0x80}),
   112  		wstr: []uint16{0xDC00, 0xD834, 0xDF06, 0xDC00},
   113  	},
   114  	{
   115  		str:  "\xED\xBB\xAE",
   116  		wstr: []uint16{0xDEEE},
   117  	},
   118  	{
   119  		str:  "\xED\xBF\xBF",
   120  		wstr: []uint16{0xDFFF},
   121  	},
   122  
   123  	// 4-byte
   124  	{
   125  		str:  "\xF0\x90\x80\x80",
   126  		wstr: []uint16{0xD800, 0xDC00},
   127  	},
   128  	{
   129  		str:  "\xF0\x9D\x8C\x86",
   130  		wstr: []uint16{0xD834, 0xDF06},
   131  	},
   132  	{
   133  		str:  "\xF4\x8F\xBF\xBF",
   134  		wstr: []uint16{0xDBFF, 0xDFFF},
   135  	},
   136  }
   137  
   138  func TestWTF16Rountrip(t *testing.T) {
   139  	for _, tt := range wtf8tests {
   140  		t.Run(fmt.Sprintf("%X", tt.str), func(t *testing.T) {
   141  			got := Encode(nil, tt.str)
   142  			got2 := string(WTF8DecodeAll(nil, got...))
   143  			if got2 != tt.str {
   144  				t.Errorf("got:\n%s\nwant:\n%s", got2, tt.str)
   145  			}
   146  		})
   147  	}
   148  }
   149  
   150  func TestWTF16Golden(t *testing.T) {
   151  	for _, tt := range wtf8tests {
   152  		t.Run(fmt.Sprintf("%X", tt.str), func(t *testing.T) {
   153  			got := Encode(nil, tt.str)
   154  			if !slices.Equal(got, tt.wstr) {
   155  				t.Errorf("got:\n%v\nwant:\n%v", got, tt.wstr)
   156  			}
   157  		})
   158  	}
   159  }
   160  
   161  func FuzzEncodeWTF16(f *testing.F) {
   162  	for _, tt := range wtf8tests {
   163  		f.Add(tt.str)
   164  	}
   165  	f.Fuzz(func(t *testing.T, b string) {
   166  		// test that there are no panics
   167  		got := Encode(nil, b)
   168  		WTF8Decode(nil, got...)
   169  		if utf8.Valid(b) {
   170  			// if the input is a valid UTF-8 string, then
   171  			// test that encodeWTF16 behaves as
   172  			// utf16.Encode
   173  			want := utf16.AppendRunes(nil, []rune(b)...)
   174  			if !slices.Equal(got, want) {
   175  				t.Errorf("got:\n%v\nwant:\n%v", got, want)
   176  			}
   177  		}
   178  	})
   179  }
   180  
   181  func FuzzDecodeWTF16(f *testing.F) {
   182  	for _, tt := range wtf8tests {
   183  		b := unsafe.Slice((*uint8)(unsafe.Pointer(unsafe.SliceData(tt.wstr))), len(tt.wstr)*2)
   184  		f.Add(b)
   185  	}
   186  	f.Fuzz(func(t *testing.T, b []byte) {
   187  		u16 := unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(b))), len(b)/2)
   188  		got := WTF8DecodeAll(nil, u16...)
   189  		if utf8.Valid(stdstring.FromBytes(got)) {
   190  			// if the input is a valid UTF-8 string, then
   191  			// test that decodeWTF16 behaves as
   192  			// utf16.Decode
   193  			want := utf16.RunesAppend(nil, u16...)
   194  			if string(got) != string(want) {
   195  				t.Errorf("got:\n%s\nwant:\n%s", string(got), string(want))
   196  			}
   197  		}
   198  		// WTF-8 should always roundtrip
   199  		got2 := Encode(nil, string(got))
   200  		if !slices.Equal(got2, u16) {
   201  			t.Errorf("got:\n%v\nwant:\n%v", got2, u16)
   202  		}
   203  	})
   204  }