github.com/liquid-dev/text@v0.3.3-liquid/encoding/unicode/unicode_test.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package unicode
     6  
     7  import (
     8  	"testing"
     9  
    10  	"github.com/liquid-dev/text/encoding"
    11  	"github.com/liquid-dev/text/encoding/charmap"
    12  	"github.com/liquid-dev/text/encoding/internal/enctest"
    13  	"github.com/liquid-dev/text/transform"
    14  )
    15  
    16  func TestBasics(t *testing.T) {
    17  	testCases := []struct {
    18  		e         encoding.Encoding
    19  		encPrefix string
    20  		encSuffix string
    21  		encoded   string
    22  		utf8      string
    23  	}{{
    24  		e:       utf16BEIB,
    25  		encoded: "\x00\x57\x00\xe4\xd8\x35\xdd\x65",
    26  		utf8:    "\x57\u00e4\U0001d565",
    27  	}, {
    28  		e:         utf16BEEB,
    29  		encPrefix: "\xfe\xff",
    30  		encoded:   "\x00\x57\x00\xe4\xd8\x35\xdd\x65",
    31  		utf8:      "\x57\u00e4\U0001d565",
    32  	}, {
    33  		e:       utf16LEIB,
    34  		encoded: "\x57\x00\xe4\x00\x35\xd8\x65\xdd",
    35  		utf8:    "\x57\u00e4\U0001d565",
    36  	}, {
    37  		e:         utf16LEEB,
    38  		encPrefix: "\xff\xfe",
    39  		encoded:   "\x57\x00\xe4\x00\x35\xd8\x65\xdd",
    40  		utf8:      "\x57\u00e4\U0001d565",
    41  	}}
    42  
    43  	for _, tc := range testCases {
    44  		enctest.TestEncoding(t, tc.e, tc.encoded, tc.utf8, tc.encPrefix, tc.encSuffix)
    45  	}
    46  }
    47  
    48  func TestFiles(t *testing.T) {
    49  	enctest.TestFile(t, UTF8)
    50  	enctest.TestFile(t, utf16LEIB)
    51  }
    52  
    53  func BenchmarkEncoding(b *testing.B) {
    54  	enctest.Benchmark(b, UTF8)
    55  	enctest.Benchmark(b, utf16LEIB)
    56  }
    57  
    58  var (
    59  	utf16LEIB = UTF16(LittleEndian, IgnoreBOM) // UTF-16LE (atypical interpretation)
    60  	utf16LEUB = UTF16(LittleEndian, UseBOM)    // UTF-16, LE
    61  	utf16LEEB = UTF16(LittleEndian, ExpectBOM) // UTF-16, LE, Expect
    62  	utf16BEIB = UTF16(BigEndian, IgnoreBOM)    // UTF-16BE (atypical interpretation)
    63  	utf16BEUB = UTF16(BigEndian, UseBOM)       // UTF-16 default
    64  	utf16BEEB = UTF16(BigEndian, ExpectBOM)    // UTF-16 Expect
    65  )
    66  
    67  func TestUTF16(t *testing.T) {
    68  	testCases := []struct {
    69  		desc    string
    70  		src     string
    71  		notEOF  bool // the inverse of atEOF
    72  		sizeDst int
    73  		want    string
    74  		nSrc    int
    75  		err     error
    76  		t       transform.Transformer
    77  	}{{
    78  		desc: "utf-16 IgnoreBOM dec: empty string",
    79  		t:    utf16BEIB.NewDecoder(),
    80  	}, {
    81  		desc: "utf-16 UseBOM dec: empty string",
    82  		t:    utf16BEUB.NewDecoder(),
    83  	}, {
    84  		desc: "utf-16 ExpectBOM dec: empty string",
    85  		err:  ErrMissingBOM,
    86  		t:    utf16BEEB.NewDecoder(),
    87  	}, {
    88  		desc:    "utf-16 dec: BOM determines encoding BE (RFC 2781:3.3)",
    89  		src:     "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
    90  		sizeDst: 100,
    91  		want:    "\U00012345=Ra",
    92  		nSrc:    12,
    93  		t:       utf16BEUB.NewDecoder(),
    94  	}, {
    95  		desc:    "utf-16 dec: BOM determines encoding LE (RFC 2781:3.3)",
    96  		src:     "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
    97  		sizeDst: 100,
    98  		want:    "\U00012345=Ra",
    99  		nSrc:    12,
   100  		t:       utf16LEUB.NewDecoder(),
   101  	}, {
   102  		desc:    "utf-16 dec: BOM determines encoding LE, change default (RFC 2781:3.3)",
   103  		src:     "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
   104  		sizeDst: 100,
   105  		want:    "\U00012345=Ra",
   106  		nSrc:    12,
   107  		t:       utf16BEUB.NewDecoder(),
   108  	}, {
   109  		desc:    "utf-16 dec: Fail on missing BOM when required",
   110  		src:     "\x08\xD8\x45\xDF\x3D\x00\xFF\xFE\xFE\xFF\x00\x52\x00\x61",
   111  		sizeDst: 100,
   112  		want:    "",
   113  		nSrc:    0,
   114  		err:     ErrMissingBOM,
   115  		t:       utf16BEEB.NewDecoder(),
   116  	}, {
   117  		desc:    "utf-16 dec: SHOULD interpret text as big-endian when BOM not present (RFC 2781:4.3)",
   118  		src:     "\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
   119  		sizeDst: 100,
   120  		want:    "\U00012345=Ra",
   121  		nSrc:    10,
   122  		t:       utf16BEUB.NewDecoder(),
   123  	}, {
   124  		// This is an error according to RFC 2781. But errors in RFC 2781 are
   125  		// open to interpretations, so I guess this is fine.
   126  		desc:    "utf-16le dec: incorrect BOM is an error (RFC 2781:4.1)",
   127  		src:     "\xFE\xFF\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
   128  		sizeDst: 100,
   129  		want:    "\uFFFE\U00012345=Ra",
   130  		nSrc:    12,
   131  		t:       utf16LEIB.NewDecoder(),
   132  	}, {
   133  		desc:    "utf-16 enc: SHOULD write BOM (RFC 2781:3.3)",
   134  		src:     "\U00012345=Ra",
   135  		sizeDst: 100,
   136  		want:    "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
   137  		nSrc:    7,
   138  		t:       utf16LEUB.NewEncoder(),
   139  	}, {
   140  		desc:    "utf-16 enc: SHOULD write BOM (RFC 2781:3.3)",
   141  		src:     "\U00012345=Ra",
   142  		sizeDst: 100,
   143  		want:    "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
   144  		nSrc:    7,
   145  		t:       utf16BEUB.NewEncoder(),
   146  	}, {
   147  		desc:    "utf-16le enc: MUST NOT write BOM (RFC 2781:3.3)",
   148  		src:     "\U00012345=Ra",
   149  		sizeDst: 100,
   150  		want:    "\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
   151  		nSrc:    7,
   152  		t:       utf16LEIB.NewEncoder(),
   153  	}, {
   154  		desc:    "utf-16be dec: incorrect UTF-16: odd bytes",
   155  		src:     "\x00",
   156  		sizeDst: 100,
   157  		want:    "\uFFFD",
   158  		nSrc:    1,
   159  		t:       utf16BEIB.NewDecoder(),
   160  	}, {
   161  		desc:    "utf-16be dec: unpaired surrogate, odd bytes",
   162  		src:     "\xD8\x45\x00",
   163  		sizeDst: 100,
   164  		want:    "\uFFFD\uFFFD",
   165  		nSrc:    3,
   166  		t:       utf16BEIB.NewDecoder(),
   167  	}, {
   168  		desc:    "utf-16be dec: unpaired low surrogate + valid text",
   169  		src:     "\xD8\x45\x00a",
   170  		sizeDst: 100,
   171  		want:    "\uFFFDa",
   172  		nSrc:    4,
   173  		t:       utf16BEIB.NewDecoder(),
   174  	}, {
   175  		desc:    "utf-16be dec: unpaired low surrogate + valid text + single byte",
   176  		src:     "\xD8\x45\x00ab",
   177  		sizeDst: 100,
   178  		want:    "\uFFFDa\uFFFD",
   179  		nSrc:    5,
   180  		t:       utf16BEIB.NewDecoder(),
   181  	}, {
   182  		desc:    "utf-16le dec: unpaired high surrogate",
   183  		src:     "\x00\x00\x00\xDC\x12\xD8",
   184  		sizeDst: 100,
   185  		want:    "\x00\uFFFD\uFFFD",
   186  		nSrc:    6,
   187  		t:       utf16LEIB.NewDecoder(),
   188  	}, {
   189  		desc:    "utf-16be dec: two unpaired low surrogates",
   190  		src:     "\xD8\x45\xD8\x12",
   191  		sizeDst: 100,
   192  		want:    "\uFFFD\uFFFD",
   193  		nSrc:    4,
   194  		t:       utf16BEIB.NewDecoder(),
   195  	}, {
   196  		desc:    "utf-16be dec: short dst",
   197  		src:     "\x00a",
   198  		sizeDst: 0,
   199  		want:    "",
   200  		nSrc:    0,
   201  		t:       utf16BEIB.NewDecoder(),
   202  		err:     transform.ErrShortDst,
   203  	}, {
   204  		desc:    "utf-16be dec: short dst surrogate",
   205  		src:     "\xD8\xF5\xDC\x12",
   206  		sizeDst: 3,
   207  		want:    "",
   208  		nSrc:    0,
   209  		t:       utf16BEIB.NewDecoder(),
   210  		err:     transform.ErrShortDst,
   211  	}, {
   212  		desc:    "utf-16be dec: short dst trailing byte",
   213  		src:     "\x00",
   214  		sizeDst: 2,
   215  		want:    "",
   216  		nSrc:    0,
   217  		t:       utf16BEIB.NewDecoder(),
   218  		err:     transform.ErrShortDst,
   219  	}, {
   220  		desc:    "utf-16be dec: short src",
   221  		src:     "\x00",
   222  		notEOF:  true,
   223  		sizeDst: 3,
   224  		want:    "",
   225  		nSrc:    0,
   226  		t:       utf16BEIB.NewDecoder(),
   227  		err:     transform.ErrShortSrc,
   228  	}, {
   229  		desc:    "utf-16 enc",
   230  		src:     "\U00012345=Ra",
   231  		sizeDst: 100,
   232  		want:    "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
   233  		nSrc:    7,
   234  		t:       utf16BEUB.NewEncoder(),
   235  	}, {
   236  		desc:    "utf-16 enc: short dst normal",
   237  		src:     "\U00012345=Ra",
   238  		sizeDst: 9,
   239  		want:    "\xD8\x08\xDF\x45\x00\x3D\x00\x52",
   240  		nSrc:    6,
   241  		t:       utf16BEIB.NewEncoder(),
   242  		err:     transform.ErrShortDst,
   243  	}, {
   244  		desc:    "utf-16 enc: short dst surrogate",
   245  		src:     "\U00012345=Ra",
   246  		sizeDst: 3,
   247  		want:    "",
   248  		nSrc:    0,
   249  		t:       utf16BEIB.NewEncoder(),
   250  		err:     transform.ErrShortDst,
   251  	}, {
   252  		desc:    "utf-16 enc: short src",
   253  		src:     "\U00012345=Ra\xC2",
   254  		notEOF:  true,
   255  		sizeDst: 100,
   256  		want:    "\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
   257  		nSrc:    7,
   258  		t:       utf16BEIB.NewEncoder(),
   259  		err:     transform.ErrShortSrc,
   260  	}, {
   261  		desc:    "utf-16be dec: don't change byte order mid-stream",
   262  		src:     "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\xFF\xFE\x00\x52\x00\x61",
   263  		sizeDst: 100,
   264  		want:    "\U00012345=\ufffeRa",
   265  		nSrc:    14,
   266  		t:       utf16BEUB.NewDecoder(),
   267  	}, {
   268  		desc:    "utf-16le dec: don't change byte order mid-stream",
   269  		src:     "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\xFF\xFE\xFE\xFF\x52\x00\x61\x00",
   270  		sizeDst: 100,
   271  		want:    "\U00012345=\ufeff\ufffeRa",
   272  		nSrc:    16,
   273  		t:       utf16LEUB.NewDecoder(),
   274  	}}
   275  	for i, tc := range testCases {
   276  		b := make([]byte, tc.sizeDst)
   277  		nDst, nSrc, err := tc.t.Transform(b, []byte(tc.src), !tc.notEOF)
   278  		if err != tc.err {
   279  			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
   280  		}
   281  		if got := string(b[:nDst]); got != tc.want {
   282  			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
   283  		}
   284  		if nSrc != tc.nSrc {
   285  			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
   286  		}
   287  	}
   288  }
   289  
   290  func TestUTF8Decoder(t *testing.T) {
   291  	testCases := []struct {
   292  		desc    string
   293  		src     string
   294  		notEOF  bool // the inverse of atEOF
   295  		sizeDst int
   296  		want    string
   297  		nSrc    int
   298  		err     error
   299  	}{{
   300  		desc: "empty string, empty dest buffer",
   301  	}, {
   302  		desc:    "empty string",
   303  		sizeDst: 8,
   304  	}, {
   305  		desc:    "empty string, streaming",
   306  		notEOF:  true,
   307  		sizeDst: 8,
   308  	}, {
   309  		desc:    "ascii",
   310  		src:     "abcde",
   311  		sizeDst: 8,
   312  		want:    "abcde",
   313  		nSrc:    5,
   314  	}, {
   315  		desc:    "ascii and error",
   316  		src:     "ab\x80de",
   317  		sizeDst: 7,
   318  		want:    "ab\ufffdde",
   319  		nSrc:    5,
   320  	}, {
   321  		desc:    "valid two-byte sequence",
   322  		src:     "a\u0300bc",
   323  		sizeDst: 7,
   324  		want:    "a\u0300bc",
   325  		nSrc:    5,
   326  	}, {
   327  		desc:    "valid three-byte sequence",
   328  		src:     "a\u0300中",
   329  		sizeDst: 7,
   330  		want:    "a\u0300中",
   331  		nSrc:    6,
   332  	}, {
   333  		desc:    "valid four-byte sequence",
   334  		src:     "a中\U00016F50",
   335  		sizeDst: 8,
   336  		want:    "a中\U00016F50",
   337  		nSrc:    8,
   338  	}, {
   339  		desc:    "short source buffer",
   340  		src:     "abc\xf0\x90",
   341  		notEOF:  true,
   342  		sizeDst: 10,
   343  		want:    "abc",
   344  		nSrc:    3,
   345  		err:     transform.ErrShortSrc,
   346  	}, {
   347  		// We don't check for the maximal subpart of an ill-formed subsequence
   348  		// at the end of an open segment.
   349  		desc:    "complete invalid that looks like short at end",
   350  		src:     "abc\xf0\x80",
   351  		notEOF:  true,
   352  		sizeDst: 10,
   353  		want:    "abc", // instead of "abc\ufffd\ufffd",
   354  		nSrc:    3,
   355  		err:     transform.ErrShortSrc,
   356  	}, {
   357  		desc:    "incomplete sequence at end",
   358  		src:     "a\x80bc\xf0\x90",
   359  		sizeDst: 9,
   360  		want:    "a\ufffdbc\ufffd",
   361  		nSrc:    6,
   362  	}, {
   363  		desc:    "invalid second byte",
   364  		src:     "abc\xf0dddd",
   365  		sizeDst: 10,
   366  		want:    "abc\ufffddddd",
   367  		nSrc:    8,
   368  	}, {
   369  		desc:    "invalid second byte at end",
   370  		src:     "abc\xf0d",
   371  		sizeDst: 10,
   372  		want:    "abc\ufffdd",
   373  		nSrc:    5,
   374  	}, {
   375  		desc:    "invalid third byte",
   376  		src:     "a\u0300bc\xf0\x90dddd",
   377  		sizeDst: 12,
   378  		want:    "a\u0300bc\ufffddddd",
   379  		nSrc:    11,
   380  	}, {
   381  		desc:    "invalid third byte at end",
   382  		src:     "a\u0300bc\xf0\x90d",
   383  		sizeDst: 12,
   384  		want:    "a\u0300bc\ufffdd",
   385  		nSrc:    8,
   386  	}, {
   387  		desc:    "invalid fourth byte, tight buffer",
   388  		src:     "a\u0300bc\xf0\x90\x80d",
   389  		sizeDst: 9,
   390  		want:    "a\u0300bc\ufffdd",
   391  		nSrc:    9,
   392  	}, {
   393  		desc:    "invalid fourth byte at end",
   394  		src:     "a\u0300bc\xf0\x90\x80",
   395  		sizeDst: 8,
   396  		want:    "a\u0300bc\ufffd",
   397  		nSrc:    8,
   398  	}, {
   399  		desc:    "invalid fourth byte and short four byte sequence",
   400  		src:     "a\u0300bc\xf0\x90\x80\xf0\x90\x80",
   401  		notEOF:  true,
   402  		sizeDst: 20,
   403  		want:    "a\u0300bc\ufffd",
   404  		nSrc:    8,
   405  		err:     transform.ErrShortSrc,
   406  	}, {
   407  		desc:    "valid four-byte sequence overflowing short buffer",
   408  		src:     "a\u0300bc\xf0\x90\x80\x80",
   409  		notEOF:  true,
   410  		sizeDst: 8,
   411  		want:    "a\u0300bc",
   412  		nSrc:    5,
   413  		err:     transform.ErrShortDst,
   414  	}, {
   415  		desc:    "invalid fourth byte at end short, but short dst",
   416  		src:     "a\u0300bc\xf0\x90\x80\xf0\x90\x80",
   417  		notEOF:  true,
   418  		sizeDst: 8,
   419  		// More bytes would fit in the buffer, but this seems to require a more
   420  		// complicated and slower algorithm.
   421  		want: "a\u0300bc", // instead of "a\u0300bc"
   422  		nSrc: 5,
   423  		err:  transform.ErrShortDst,
   424  	}, {
   425  		desc:    "short dst for error",
   426  		src:     "abc\x80",
   427  		notEOF:  true,
   428  		sizeDst: 5,
   429  		want:    "abc",
   430  		nSrc:    3,
   431  		err:     transform.ErrShortDst,
   432  	}, {
   433  		desc:    "adjusting short dst buffer",
   434  		src:     "abc\x80ef",
   435  		notEOF:  true,
   436  		sizeDst: 6,
   437  		want:    "abc\ufffd",
   438  		nSrc:    4,
   439  		err:     transform.ErrShortDst,
   440  	}}
   441  	tr := UTF8.NewDecoder()
   442  	for i, tc := range testCases {
   443  		b := make([]byte, tc.sizeDst)
   444  		nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF)
   445  		if err != tc.err {
   446  			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
   447  		}
   448  		if got := string(b[:nDst]); got != tc.want {
   449  			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
   450  		}
   451  		if nSrc != tc.nSrc {
   452  			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
   453  		}
   454  	}
   455  }
   456  
   457  func TestBOMOverride(t *testing.T) {
   458  	dec := BOMOverride(charmap.CodePage437.NewDecoder())
   459  	dst := make([]byte, 100)
   460  	for i, tc := range []struct {
   461  		src   string
   462  		atEOF bool
   463  		dst   string
   464  		nSrc  int
   465  		err   error
   466  	}{
   467  		0:  {"H\x82ll\x93", true, "Héllô", 5, nil},
   468  		1:  {"\uFEFFHéllö", true, "Héllö", 10, nil},
   469  		2:  {"\xFE\xFF\x00H\x00e\x00l\x00l\x00o", true, "Hello", 12, nil},
   470  		3:  {"\xFF\xFEH\x00e\x00l\x00l\x00o\x00", true, "Hello", 12, nil},
   471  		4:  {"\uFEFF", true, "", 3, nil},
   472  		5:  {"\xFE\xFF", true, "", 2, nil},
   473  		6:  {"\xFF\xFE", true, "", 2, nil},
   474  		7:  {"\xEF\xBB", true, "\u2229\u2557", 2, nil},
   475  		8:  {"\xEF", true, "\u2229", 1, nil},
   476  		9:  {"", true, "", 0, nil},
   477  		10: {"\xFE", true, "\u25a0", 1, nil},
   478  		11: {"\xFF", true, "\u00a0", 1, nil},
   479  		12: {"\xEF\xBB", false, "", 0, transform.ErrShortSrc},
   480  		13: {"\xEF", false, "", 0, transform.ErrShortSrc},
   481  		14: {"", false, "", 0, transform.ErrShortSrc},
   482  		15: {"\xFE", false, "", 0, transform.ErrShortSrc},
   483  		16: {"\xFF", false, "", 0, transform.ErrShortSrc},
   484  		17: {"\xFF\xFE", false, "", 0, transform.ErrShortSrc},
   485  	} {
   486  		dec.Reset()
   487  		nDst, nSrc, err := dec.Transform(dst, []byte(tc.src), tc.atEOF)
   488  		got := string(dst[:nDst])
   489  		if nSrc != tc.nSrc {
   490  			t.Errorf("%d: nSrc: got %d; want %d", i, nSrc, tc.nSrc)
   491  		}
   492  		if got != tc.dst {
   493  			t.Errorf("%d: got %+q; want %+q", i, got, tc.dst)
   494  		}
   495  		if err != tc.err {
   496  			t.Errorf("%d: error: got %v; want %v", i, err, tc.err)
   497  		}
   498  	}
   499  }