github.com/NeowayLabs/nash@v0.2.2-0.20200127205349-a227041ffd50/stdbin/strings/strings_test.go (about)

     1  package main_test
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  	"testing"
    10  
    11  	strings "github.com/madlambda/nash/stdbin/strings"
    12  )
    13  
    14  func TestStrings(t *testing.T) {
    15  
    16  	type testcase struct {
    17  		name        string
    18  		input       func([]byte) []byte
    19  		output      []string
    20  		minWordSize uint
    21  	}
    22  
    23  	tcases := []testcase{
    24  		{
    25  			name:        "UTF-8With2Bytes",
    26  			minWordSize: 1,
    27  			input: func(bin []byte) []byte {
    28  				return append([]byte("λ"), bin...)
    29  			},
    30  			output: []string{"λ"},
    31  		},
    32  		{
    33  			name:        "UTF-8With3Bytes",
    34  			minWordSize: 1,
    35  			input: func(bin []byte) []byte {
    36  				return append([]byte("€"), bin...)
    37  			},
    38  			output: []string{"€"},
    39  		},
    40  		{
    41  			name:        "UTF-8With4Bytes",
    42  			minWordSize: 1,
    43  			input: func(bin []byte) []byte {
    44  				return append([]byte("𐍈"), bin...)
    45  			},
    46  			output: []string{"𐍈"},
    47  		},
    48  		{
    49  			name:        "NonASCIIWordHasOneLessCharThanMin",
    50  			minWordSize: 2,
    51  			input: func(bin []byte) []byte {
    52  				return append([]byte("λ"), bin...)
    53  			},
    54  			output: []string{},
    55  		},
    56  		{
    57  			name:        "NonASCIIWordHasMinWordSize",
    58  			minWordSize: 2,
    59  			input: func(bin []byte) []byte {
    60  				return append([]byte("λλ"), bin...)
    61  			},
    62  			output: []string{"λλ"},
    63  		},
    64  		{
    65  			name:        "WordHasOneLessCharThanMin",
    66  			minWordSize: 2,
    67  			input: func(bin []byte) []byte {
    68  				return append([]byte("k"), bin...)
    69  			},
    70  			output: []string{},
    71  		},
    72  		{
    73  			name:        "WordHasMinWordSize",
    74  			minWordSize: 2,
    75  			input: func(bin []byte) []byte {
    76  				return append([]byte("kz"), bin...)
    77  			},
    78  			output: []string{"kz"},
    79  		},
    80  		{
    81  			name:        "WordHasOneMoreCharThanMinWordSize",
    82  			minWordSize: 2,
    83  			input: func(bin []byte) []byte {
    84  				return append([]byte("ktz"), bin...)
    85  			},
    86  			output: []string{"ktz"},
    87  		},
    88  		{
    89  			name:        "StartingWithOneChar",
    90  			minWordSize: 1,
    91  			input: func(bin []byte) []byte {
    92  				return append([]byte("k"), bin...)
    93  			},
    94  			output: []string{"k"},
    95  		},
    96  		{
    97  			name:        "EndWithOneChar",
    98  			minWordSize: 1,
    99  			input: func(bin []byte) []byte {
   100  				return append(bin, []byte("k")...)
   101  			},
   102  			output: []string{"k"},
   103  		},
   104  		{
   105  			name:        "OneCharInTheMiddle",
   106  			minWordSize: 1,
   107  			input: func(bin []byte) []byte {
   108  				t := append(bin, []byte("k")...)
   109  				t = append(t, bin...)
   110  				return t
   111  			},
   112  			output: []string{"k"},
   113  		},
   114  		{
   115  			name:        "StartingWithText",
   116  			minWordSize: 1,
   117  			input: func(bin []byte) []byte {
   118  				expected := "textOnBeggining"
   119  				return append([]byte(expected), bin...)
   120  			},
   121  			output: []string{"textOnBeggining"},
   122  		},
   123  		{
   124  			name:        "TextOnMiddle",
   125  			minWordSize: 1,
   126  			input: func(bin []byte) []byte {
   127  				expected := "textOnMiddle"
   128  				return append(bin, append([]byte(expected), bin...)...)
   129  			},
   130  			output: []string{"textOnMiddle"},
   131  		},
   132  		{
   133  			name:        "NonASCIITextOnMiddle",
   134  			minWordSize: 1,
   135  			input: func(bin []byte) []byte {
   136  				expected := "λλλ"
   137  				return append(bin, append([]byte(expected), bin...)...)
   138  			},
   139  			output: []string{"λλλ"},
   140  		},
   141  		{
   142  			name:        "ASCIIAndNonASCII",
   143  			minWordSize: 1,
   144  			input: func(bin []byte) []byte {
   145  				expected := "(define (λ (x) (+ x a)))"
   146  				return append(bin, append([]byte(expected), bin...)...)
   147  			},
   148  			output: []string{"(define (λ (x) (+ x a)))"},
   149  		},
   150  		{
   151  			name:        "TextOnEnd",
   152  			minWordSize: 1,
   153  			input: func(bin []byte) []byte {
   154  				expected := "textOnEnd"
   155  				return append(bin, append([]byte(expected), bin...)...)
   156  			},
   157  			output: []string{"textOnEnd"},
   158  		},
   159  		{
   160  			name:        "JustText",
   161  			minWordSize: 1,
   162  			input: func(bin []byte) []byte {
   163  				return []byte("justtext")
   164  			},
   165  			output: []string{"justtext"},
   166  		},
   167  		{
   168  			name:        "JustBinary",
   169  			minWordSize: 1,
   170  			input: func(bin []byte) []byte {
   171  				return bin
   172  			},
   173  			output: []string{},
   174  		},
   175  		{
   176  			name:        "TextSeparatedByBinary",
   177  			minWordSize: 1,
   178  			input: func(bin []byte) []byte {
   179  				text := []byte("text")
   180  				t := []byte{}
   181  				t = append(t, bin...)
   182  				t = append(t, text...)
   183  				t = append(t, bin...)
   184  				t = append(t, text...)
   185  				return t
   186  			},
   187  			output: []string{"text", "text"},
   188  		},
   189  		{
   190  			name:        "NonASCIITextSeparatedByBinary",
   191  			minWordSize: 1,
   192  			input: func(bin []byte) []byte {
   193  				text := []byte("awesomeλ=)")
   194  				t := []byte{}
   195  				t = append(t, bin...)
   196  				t = append(t, text...)
   197  				t = append(t, bin...)
   198  				t = append(t, text...)
   199  				return t
   200  			},
   201  			output: []string{"awesomeλ=)", "awesomeλ=)"},
   202  		},
   203  		{
   204  			name:        "WordsAreNotAccumulativeBetweenBinData",
   205  			minWordSize: 2,
   206  			input: func(bin []byte) []byte {
   207  				t := append([]byte("k"), bin...)
   208  				return append(t, byte('t'))
   209  			},
   210  			output: []string{},
   211  		},
   212  		{
   213  			name:        "ASCIISeparatedByByteThatLooksLikeUTF",
   214  			minWordSize: 1,
   215  			input: func(bin []byte) []byte {
   216  				return append([]byte{
   217  					'n',
   218  					runestart,
   219  					'k',
   220  				}, bin...)
   221  			},
   222  			output: []string{"n", "k"},
   223  		},
   224  		{
   225  			name:        "ASCIIAfterPossibleFirstByteOfUTF",
   226  			minWordSize: 1,
   227  			input: func(bin []byte) []byte {
   228  				return append([]byte{
   229  					runestart,
   230  					'k',
   231  				}, bin...)
   232  			},
   233  			output: []string{"k"},
   234  		},
   235  		{
   236  			name:        "ASCIIAfterPossibleSecondByteOfUTF",
   237  			minWordSize: 1,
   238  			input: func(bin []byte) []byte {
   239  				return append([]byte{
   240  					byte(0xE2),
   241  					byte(0x82),
   242  					'k',
   243  				}, bin...)
   244  			},
   245  			output: []string{"k"},
   246  		},
   247  		{
   248  			name:        "ASCIIAfterPossibleThirdByteOfUTF",
   249  			minWordSize: 1,
   250  			input: func(bin []byte) []byte {
   251  				return append([]byte{
   252  					byte(0xF0),
   253  					byte(0x90),
   254  					byte(0x8D),
   255  					'k',
   256  				}, bin...)
   257  			},
   258  			output: []string{"k"},
   259  		},
   260  		{
   261  			name:        "AfterFalseRuneStartRuneStartOnSecondByte",
   262  			minWordSize: 1,
   263  			input: func(bin []byte) []byte {
   264  				i := []byte{byte(0xF0)}
   265  				i = append(i, []byte("λ")...)
   266  				return append(i, bin...)
   267  			},
   268  			output: []string{"λ"},
   269  		},
   270  		{
   271  			name:        "AfterFalseRuneStartRuneStartOnThirdByte",
   272  			minWordSize: 1,
   273  			input: func(bin []byte) []byte {
   274  				i := []byte{byte(0xF0), byte(0x90)}
   275  				i = append(i, []byte("λ")...)
   276  				return append(i, bin...)
   277  			},
   278  			output: []string{"λ"},
   279  		},
   280  		{
   281  			name:        "AfterFalseRuneStartRuneStartOnFourthByte",
   282  			minWordSize: 1,
   283  			input: func(bin []byte) []byte {
   284  				i := []byte{byte(0xF0), byte(0x90), byte(0x8D)}
   285  				i = append(i, []byte("λ")...)
   286  				return append(i, bin...)
   287  			},
   288  			output: []string{"λ"},
   289  		},
   290  		{
   291  			name:        "ASCIIFakeRuneAndThemRune",
   292  			minWordSize: 1,
   293  			input: func(bin []byte) []byte {
   294  				i := []byte{'v'}
   295  				i = append(i, byte(0xF0))
   296  				i = append(i, []byte("λ")...)
   297  				return append(i, bin...)
   298  			},
   299  			output: []string{"v", "λ"},
   300  		},
   301  		{
   302  			name:        "ASCIISplittedByZero",
   303  			minWordSize: 1,
   304  			input: func([]byte) []byte {
   305  				return []byte{'k', 0, 'n', 0, 'v'}
   306  			},
   307  			output: []string{"k", "n", "v"},
   308  		},
   309  		{
   310  			name:        "RunesSplittedByZero",
   311  			minWordSize: 1,
   312  			input: func([]byte) []byte {
   313  				i := []byte("λ")
   314  				i = append(i, 0)
   315  				i = append(i, []byte("λ")...)
   316  				return i
   317  			},
   318  			output: []string{"λ", "λ"},
   319  		},
   320  		{
   321  			name:        "ASCIIAndRunesSplittedByZero",
   322  			minWordSize: 1,
   323  			input: func([]byte) []byte {
   324  				i := []byte("λ")
   325  				i = append(i, 0)
   326  				i = append(i, 's')
   327  				i = append(i, 0)
   328  				i = append(i, []byte("λ")...)
   329  				return i
   330  			},
   331  			output: []string{"λ", "s", "λ"},
   332  		},
   333  	}
   334  
   335  	minBinChunkSize := 1
   336  	maxBinChunkSize := 128
   337  
   338  	for _, tcase := range tcases {
   339  		for i := minBinChunkSize; i <= maxBinChunkSize; i++ {
   340  			binsize := i
   341  			testname := fmt.Sprintf("%s/binSize%d", tcase.name, binsize)
   342  			t.Run(testname, func(t *testing.T) {
   343  				bin := newBinary(uint(binsize))
   344  				input := tcase.input(bin)
   345  				scanner := strings.Do(bytes.NewBuffer(input), tcase.minWordSize)
   346  
   347  				lines := []string{}
   348  				for scanner.Scan() {
   349  					lines = append(lines, scanner.Text())
   350  				}
   351  
   352  				if len(lines) != len(tcase.output) {
   353  					t.Errorf("wanted size[%d] got size[%d]", len(tcase.output), len(lines))
   354  					t.Fatalf("wanted[%s] got[%s]", tcase.output, lines)
   355  				}
   356  
   357  				for i, want := range tcase.output {
   358  					got := lines[i]
   359  					if want != got {
   360  						t.Errorf("unexpected line at[%d]", i)
   361  						t.Errorf("wanted[%s] got[%s]", want, got)
   362  						t.Errorf("wantedLines[%s] gotLines[%s]", tcase.output, lines)
   363  					}
   364  				}
   365  
   366  				if scanner.Err() != nil {
   367  					t.Fatalf("unexpected error[%s]", scanner.Err())
   368  				}
   369  			})
   370  		}
   371  	}
   372  }
   373  
   374  func TestStringsReadErrorOnFirstByte(t *testing.T) {
   375  	var minWordSize uint = 1
   376  	scanner := strings.Do(newFakeReader(func(d []byte) (int, error) {
   377  		return 0, errors.New("fake injected error")
   378  	}), minWordSize)
   379  	assertScannerFails(t, scanner, 0)
   380  }
   381  
   382  func TestStringsReadErrorOnSecondByte(t *testing.T) {
   383  	var minWordSize uint = 1
   384  	sentFirstByte := false
   385  	scanner := strings.Do(newFakeReader(func(d []byte) (int, error) {
   386  		if sentFirstByte {
   387  			return 0, errors.New("fake injected error")
   388  		}
   389  		d[0] = 'k'
   390  		sentFirstByte = true
   391  		return 1, nil
   392  	}), minWordSize)
   393  	assertScannerFails(t, scanner, 1)
   394  }
   395  
   396  func TestStringsReadErrorAfterValidUTF8StartingByte(t *testing.T) {
   397  	var minWordSize uint = 1
   398  	sentFirstByte := false
   399  	scanner := strings.Do(newFakeReader(func(d []byte) (int, error) {
   400  		if sentFirstByte {
   401  			return 0, errors.New("fake injected error")
   402  		}
   403  		sentFirstByte = true
   404  		d[0] = runestart
   405  		return 1, nil
   406  	}), minWordSize)
   407  	assertScannerFails(t, scanner, 0)
   408  }
   409  
   410  func TestStringsReadCanReturnEOFWithData(t *testing.T) {
   411  	var minWordSize uint = 1
   412  	want := byte('k')
   413  
   414  	scanner := strings.Do(newFakeReader(func(d []byte) (int, error) {
   415  		if len(d) == 0 {
   416  			t.Fatal("empty data on Read operation")
   417  		}
   418  		d[0] = want
   419  		return 1, io.EOF
   420  	}), minWordSize)
   421  
   422  	if !scanner.Scan() {
   423  		t.Fatal("unexpected Scan failure")
   424  	}
   425  	got := scanner.Text()
   426  	if string(want) != got {
   427  		t.Fatalf("want[%s] != got[%s]", string(want), got)
   428  	}
   429  }
   430  
// runestart is a byte with the bit pattern of a UTF-8 lead byte for a
// two-byte sequence (0xC2 = 0b11000010). The tests use it on its own, without
// a continuation byte, to simulate data that only looks like the start of a
// rune.
const runestart byte = 0xC2
   432  
   433  type FakeReader struct {
   434  	read func([]byte) (int, error)
   435  }
   436  
   437  func (f *FakeReader) Read(d []byte) (int, error) {
   438  	if f.read == nil {
   439  		return 0, fmt.Errorf("FakeReader has no Read implementation")
   440  	}
   441  	return f.read(d)
   442  }
   443  
   444  func newFakeReader(read func([]byte) (int, error)) *FakeReader {
   445  	return &FakeReader{read: read}
   446  }
   447  
   448  func assertScannerFails(t *testing.T, scanner *bufio.Scanner, expectedIter uint) {
   449  	var iterations uint
   450  	for scanner.Scan() {
   451  		iterations += 1
   452  	}
   453  
   454  	if iterations != expectedIter {
   455  		t.Fatalf("expected[%d] Scan calls, got [%d]", expectedIter, iterations)
   456  	}
   457  
   458  	if scanner.Err() == nil {
   459  		t.Fatal("expected failure on scanner, got none")
   460  	}
   461  }
   462  
   463  func newBinary(size uint) []byte {
   464  	// WHY: Starting with the most significant bit as 1 helps to test
   465  	// UTF-8 corner cases. Don't change this without providing
   466  	// testing for this. Not the best way to do this (not explicit)
   467  	// but it is what we have for today =).
   468  	bin := make([]byte, size)
   469  	for i := 0; i < int(size); i++ {
   470  		bin[i] = 0xFF
   471  	}
   472  	return bin
   473  }