github.com/pgavlin/text@v0.0.0-20240419000839-8438d0a47805/replace_test.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package text_test
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"testing"
    11  
    12  	. "github.com/pgavlin/text"
    13  )
    14  
    15  var htmlEscaper = NewReplacer(
    16  	"&", "&",
    17  	"<", "&lt;",
    18  	">", "&gt;",
    19  	`"`, "&quot;",
    20  	"'", "&apos;",
    21  )
    22  
    23  var htmlUnescaper = NewReplacer(
    24  	"&amp;", "&",
    25  	"&lt;", "<",
    26  	"&gt;", ">",
    27  	"&quot;", `"`,
    28  	"&apos;", "'",
    29  )
    30  
    31  // The http package's old HTML escaping function.
    32  func oldHTMLEscape(s string) string {
    33  	s = Replace(s, "&", "&amp;", -1)
    34  	s = Replace(s, "<", "&lt;", -1)
    35  	s = Replace(s, ">", "&gt;", -1)
    36  	s = Replace(s, `"`, "&quot;", -1)
    37  	s = Replace(s, "'", "&apos;", -1)
    38  	return s
    39  }
    40  
    41  var capitalLetters = NewReplacer("a", "A", "b", "B")
    42  
    43  // TestReplacer tests the replacer implementations.
    44  func TestReplacer(t *testing.T) {
    45  	type testCase struct {
    46  		r       *Replacer[string]
    47  		in, out string
    48  	}
    49  	var testCases []testCase
    50  
    51  	// str converts 0xff to "\xff". This isn't just string(b) since that converts to UTF-8.
    52  	str := func(b byte) string {
    53  		return string([]byte{b})
    54  	}
    55  	var s []string
    56  
    57  	// inc maps "\x00"->"\x01", ..., "a"->"b", "b"->"c", ..., "\xff"->"\x00".
    58  	s = nil
    59  	for i := 0; i < 256; i++ {
    60  		s = append(s, str(byte(i)), str(byte(i+1)))
    61  	}
    62  	inc := NewReplacer(s...)
    63  
    64  	// Test cases with 1-byte old strings, 1-byte new strings.
    65  	testCases = append(testCases,
    66  		testCase{capitalLetters, "brad", "BrAd"},
    67  		testCase{capitalLetters, Repeat("a", (32<<10)+123), Repeat("A", (32<<10)+123)},
    68  		testCase{capitalLetters, "", ""},
    69  
    70  		testCase{inc, "brad", "csbe"},
    71  		testCase{inc, "\x00\xff", "\x01\x00"},
    72  		testCase{inc, "", ""},
    73  
    74  		testCase{NewReplacer("a", "1", "a", "2"), "brad", "br1d"},
    75  	)
    76  
    77  	// repeat maps "a"->"a", "b"->"bb", "c"->"ccc", ...
    78  	s = nil
    79  	for i := 0; i < 256; i++ {
    80  		n := i + 1 - 'a'
    81  		if n < 1 {
    82  			n = 1
    83  		}
    84  		s = append(s, str(byte(i)), Repeat(str(byte(i)), n))
    85  	}
    86  	repeat := NewReplacer(s...)
    87  
    88  	// Test cases with 1-byte old strings, variable length new strings.
    89  	testCases = append(testCases,
    90  		testCase{htmlEscaper, "No changes", "No changes"},
    91  		testCase{htmlEscaper, "I <3 escaping & stuff", "I &lt;3 escaping &amp; stuff"},
    92  		testCase{htmlEscaper, "&&&", "&amp;&amp;&amp;"},
    93  		testCase{htmlEscaper, "", ""},
    94  
    95  		testCase{repeat, "brad", "bbrrrrrrrrrrrrrrrrrradddd"},
    96  		testCase{repeat, "abba", "abbbba"},
    97  		testCase{repeat, "", ""},
    98  
    99  		testCase{NewReplacer("a", "11", "a", "22"), "brad", "br11d"},
   100  	)
   101  
   102  	// The remaining test cases have variable length old strings.
   103  
   104  	testCases = append(testCases,
   105  		testCase{htmlUnescaper, "&amp;amp;", "&amp;"},
   106  		testCase{htmlUnescaper, "&lt;b&gt;HTML&apos;s neat&lt;/b&gt;", "<b>HTML's neat</b>"},
   107  		testCase{htmlUnescaper, "", ""},
   108  
   109  		testCase{NewReplacer("a", "1", "a", "2", "xxx", "xxx"), "brad", "br1d"},
   110  
   111  		testCase{NewReplacer("a", "1", "aa", "2", "aaa", "3"), "aaaa", "1111"},
   112  
   113  		testCase{NewReplacer("aaa", "3", "aa", "2", "a", "1"), "aaaa", "31"},
   114  	)
   115  
   116  	// gen1 has multiple old strings of variable length. There is no
   117  	// overall non-empty common prefix, but some pairwise common prefixes.
   118  	gen1 := NewReplacer(
   119  		"aaa", "3[aaa]",
   120  		"aa", "2[aa]",
   121  		"a", "1[a]",
   122  		"i", "i",
   123  		"longerst", "most long",
   124  		"longer", "medium",
   125  		"long", "short",
   126  		"xx", "xx",
   127  		"x", "X",
   128  		"X", "Y",
   129  		"Y", "Z",
   130  	)
   131  	testCases = append(testCases,
   132  		testCase{gen1, "fooaaabar", "foo3[aaa]b1[a]r"},
   133  		testCase{gen1, "long, longerst, longer", "short, most long, medium"},
   134  		testCase{gen1, "xxxxx", "xxxxX"},
   135  		testCase{gen1, "XiX", "YiY"},
   136  		testCase{gen1, "", ""},
   137  	)
   138  
   139  	// gen2 has multiple old strings with no pairwise common prefix.
   140  	gen2 := NewReplacer(
   141  		"roses", "red",
   142  		"violets", "blue",
   143  		"sugar", "sweet",
   144  	)
   145  	testCases = append(testCases,
   146  		testCase{gen2, "roses are red, violets are blue...", "red are red, blue are blue..."},
   147  		testCase{gen2, "", ""},
   148  	)
   149  
   150  	// gen3 has multiple old strings with an overall common prefix.
   151  	gen3 := NewReplacer(
   152  		"abracadabra", "poof",
   153  		"abracadabrakazam", "splat",
   154  		"abraham", "lincoln",
   155  		"abrasion", "scrape",
   156  		"abraham", "isaac",
   157  	)
   158  	testCases = append(testCases,
   159  		testCase{gen3, "abracadabrakazam abraham", "poofkazam lincoln"},
   160  		testCase{gen3, "abrasion abracad", "scrape abracad"},
   161  		testCase{gen3, "abba abram abrasive", "abba abram abrasive"},
   162  		testCase{gen3, "", ""},
   163  	)
   164  
   165  	// foo{1,2,3,4} have multiple old strings with an overall common prefix
   166  	// and 1- or 2- byte extensions from the common prefix.
   167  	foo1 := NewReplacer(
   168  		"foo1", "A",
   169  		"foo2", "B",
   170  		"foo3", "C",
   171  	)
   172  	foo2 := NewReplacer(
   173  		"foo1", "A",
   174  		"foo2", "B",
   175  		"foo31", "C",
   176  		"foo32", "D",
   177  	)
   178  	foo3 := NewReplacer(
   179  		"foo11", "A",
   180  		"foo12", "B",
   181  		"foo31", "C",
   182  		"foo32", "D",
   183  	)
   184  	foo4 := NewReplacer(
   185  		"foo12", "B",
   186  		"foo32", "D",
   187  	)
   188  	testCases = append(testCases,
   189  		testCase{foo1, "fofoofoo12foo32oo", "fofooA2C2oo"},
   190  		testCase{foo1, "", ""},
   191  
   192  		testCase{foo2, "fofoofoo12foo32oo", "fofooA2Doo"},
   193  		testCase{foo2, "", ""},
   194  
   195  		testCase{foo3, "fofoofoo12foo32oo", "fofooBDoo"},
   196  		testCase{foo3, "", ""},
   197  
   198  		testCase{foo4, "fofoofoo12foo32oo", "fofooBDoo"},
   199  		testCase{foo4, "", ""},
   200  	)
   201  
   202  	// genAll maps "\x00\x01\x02...\xfe\xff" to "[all]", amongst other things.
   203  	allBytes := make([]byte, 256)
   204  	for i := range allBytes {
   205  		allBytes[i] = byte(i)
   206  	}
   207  	allString := string(allBytes)
   208  	genAll := NewReplacer(
   209  		allString, "[all]",
   210  		"\xff", "[ff]",
   211  		"\x00", "[00]",
   212  	)
   213  	testCases = append(testCases,
   214  		testCase{genAll, allString, "[all]"},
   215  		testCase{genAll, "a\xff" + allString + "\x00", "a[ff][all][00]"},
   216  		testCase{genAll, "", ""},
   217  	)
   218  
   219  	// Test cases with empty old strings.
   220  
   221  	blankToX1 := NewReplacer("", "X")
   222  	blankToX2 := NewReplacer("", "X", "", "")
   223  	blankHighPriority := NewReplacer("", "X", "o", "O")
   224  	blankLowPriority := NewReplacer("o", "O", "", "X")
   225  	blankNoOp1 := NewReplacer("", "")
   226  	blankNoOp2 := NewReplacer("", "", "", "A")
   227  	blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z")
   228  	testCases = append(testCases,
   229  		testCase{blankToX1, "foo", "XfXoXoX"},
   230  		testCase{blankToX1, "", "X"},
   231  
   232  		testCase{blankToX2, "foo", "XfXoXoX"},
   233  		testCase{blankToX2, "", "X"},
   234  
   235  		testCase{blankHighPriority, "oo", "XOXOX"},
   236  		testCase{blankHighPriority, "ii", "XiXiX"},
   237  		testCase{blankHighPriority, "oiio", "XOXiXiXOX"},
   238  		testCase{blankHighPriority, "iooi", "XiXOXOXiX"},
   239  		testCase{blankHighPriority, "", "X"},
   240  
   241  		testCase{blankLowPriority, "oo", "OOX"},
   242  		testCase{blankLowPriority, "ii", "XiXiX"},
   243  		testCase{blankLowPriority, "oiio", "OXiXiOX"},
   244  		testCase{blankLowPriority, "iooi", "XiOOXiX"},
   245  		testCase{blankLowPriority, "", "X"},
   246  
   247  		testCase{blankNoOp1, "foo", "foo"},
   248  		testCase{blankNoOp1, "", ""},
   249  
   250  		testCase{blankNoOp2, "foo", "foo"},
   251  		testCase{blankNoOp2, "", ""},
   252  
   253  		testCase{blankFoo, "foobarfoobaz", "XRXZX"},
   254  		testCase{blankFoo, "foobar-foobaz", "XRX-XZX"},
   255  		testCase{blankFoo, "", "X"},
   256  	)
   257  
   258  	// single string replacer
   259  
   260  	abcMatcher := NewReplacer("abc", "[match]")
   261  
   262  	testCases = append(testCases,
   263  		testCase{abcMatcher, "", ""},
   264  		testCase{abcMatcher, "ab", "ab"},
   265  		testCase{abcMatcher, "abc", "[match]"},
   266  		testCase{abcMatcher, "abcd", "[match]d"},
   267  		testCase{abcMatcher, "cabcabcdabca", "c[match][match]d[match]a"},
   268  	)
   269  
   270  	// Issue 6659 cases (more single string replacer)
   271  
   272  	noHello := NewReplacer("Hello", "")
   273  	testCases = append(testCases,
   274  		testCase{noHello, "Hello", ""},
   275  		testCase{noHello, "Hellox", "x"},
   276  		testCase{noHello, "xHello", "x"},
   277  		testCase{noHello, "xHellox", "xx"},
   278  	)
   279  
   280  	// No-arg test cases.
   281  
   282  	nop := NewReplacer[string]()
   283  	testCases = append(testCases,
   284  		testCase{nop, "abc", "abc"},
   285  		testCase{nop, "", ""},
   286  	)
   287  
   288  	// Run the test cases.
   289  
   290  	for i, tc := range testCases {
   291  		if s := tc.r.Replace(tc.in); s != tc.out {
   292  			t.Errorf("%d. Replace(%q) = %q, want %q", i, tc.in, s, tc.out)
   293  		}
   294  		var buf bytes.Buffer
   295  		n, err := tc.r.WriteString(&buf, tc.in)
   296  		if err != nil {
   297  			t.Errorf("%d. WriteString: %v", i, err)
   298  			continue
   299  		}
   300  		got := buf.String()
   301  		if got != tc.out {
   302  			t.Errorf("%d. WriteString(%q) wrote %q, want %q", i, tc.in, got, tc.out)
   303  			continue
   304  		}
   305  		if n != len(tc.out) {
   306  			t.Errorf("%d. WriteString(%q) wrote correct string but reported %d bytes; want %d (%q)",
   307  				i, tc.in, n, len(tc.out), tc.out)
   308  		}
   309  	}
   310  }
   311  
   312  var algorithmTestCases = []struct {
   313  	r    *Replacer[string]
   314  	want string
   315  }{
   316  	{capitalLetters, "*text.byteReplacer[string]"},
   317  	{htmlEscaper, "*text.byteStringReplacer[string]"},
   318  	{NewReplacer("12", "123"), "*text.singleStringReplacer[string]"},
   319  	{NewReplacer("1", "12"), "*text.byteStringReplacer[string]"},
   320  	{NewReplacer("", "X"), "*text.genericReplacer[string]"},
   321  	{NewReplacer("a", "1", "b", "12", "cde", "123"), "*text.genericReplacer[string]"},
   322  }
   323  
   324  // TestPickAlgorithm tests that NewReplacer picks the correct algorithm.
   325  func TestPickAlgorithm(t *testing.T) {
   326  	for i, tc := range algorithmTestCases {
   327  		got := fmt.Sprintf("%T", tc.r.Replacer())
   328  		if got != tc.want {
   329  			t.Errorf("%d. algorithm = %s, want %s", i, got, tc.want)
   330  		}
   331  	}
   332  }
   333  
   334  type errWriter struct{}
   335  
   336  func (errWriter) Write(p []byte) (n int, err error) {
   337  	return 0, fmt.Errorf("unwritable")
   338  }
   339  
   340  // TestWriteStringError tests that WriteString returns an error
   341  // received from the underlying io.Writer.
   342  func TestWriteStringError(t *testing.T) {
   343  	for i, tc := range algorithmTestCases {
   344  		n, err := tc.r.WriteString(errWriter{}, "abc")
   345  		if n != 0 || err == nil || err.Error() != "unwritable" {
   346  			t.Errorf("%d. WriteStringError = %d, %v, want 0, unwritable", i, n, err)
   347  		}
   348  	}
   349  }
   350  
   351  // TestGenericTrieBuilding verifies the structure of the generated trie. There
   352  // is one node per line, and the key ending with the current line is in the
   353  // trie if it ends with a "+".
   354  func TestGenericTrieBuilding(t *testing.T) {
   355  	testCases := []struct{ in, out string }{
   356  		{"abc;abdef;abdefgh;xx;xy;z", `-
   357  			a-
   358  			.b-
   359  			..c+
   360  			..d-
   361  			...ef+
   362  			.....gh+
   363  			x-
   364  			.x+
   365  			.y+
   366  			z+
   367  			`},
   368  		{"abracadabra;abracadabrakazam;abraham;abrasion", `-
   369  			a-
   370  			.bra-
   371  			....c-
   372  			.....adabra+
   373  			...........kazam+
   374  			....h-
   375  			.....am+
   376  			....s-
   377  			.....ion+
   378  			`},
   379  		{"aaa;aa;a;i;longerst;longer;long;xx;x;X;Y", `-
   380  			X+
   381  			Y+
   382  			a+
   383  			.a+
   384  			..a+
   385  			i+
   386  			l-
   387  			.ong+
   388  			....er+
   389  			......st+
   390  			x+
   391  			.x+
   392  			`},
   393  		{"foo;;foo;foo1", `+
   394  			f-
   395  			.oo+
   396  			...1+
   397  			`},
   398  	}
   399  
   400  	for _, tc := range testCases {
   401  		keys := Split(tc.in, ";")
   402  		args := make([]string, len(keys)*2)
   403  		for i, key := range keys {
   404  			args[i*2] = key
   405  		}
   406  
   407  		got := NewReplacer(args...).PrintTrie()
   408  		// Remove tabs from tc.out
   409  		wantbuf := make([]byte, 0, len(tc.out))
   410  		for i := 0; i < len(tc.out); i++ {
   411  			if tc.out[i] != '\t' {
   412  				wantbuf = append(wantbuf, tc.out[i])
   413  			}
   414  		}
   415  		want := string(wantbuf)
   416  
   417  		if got != want {
   418  			t.Errorf("PrintTrie(%q)\ngot\n%swant\n%s", tc.in, got, want)
   419  		}
   420  	}
   421  }
   422  
   423  func BenchmarkGenericNoMatch(b *testing.B) {
   424  	str := Repeat("A", 100) + Repeat("B", 100)
   425  	generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic
   426  	for i := 0; i < b.N; i++ {
   427  		generic.Replace(str)
   428  	}
   429  }
   430  
   431  func BenchmarkGenericMatch1(b *testing.B) {
   432  	str := Repeat("a", 100) + Repeat("b", 100)
   433  	generic := NewReplacer("a", "A", "b", "B", "12", "123")
   434  	for i := 0; i < b.N; i++ {
   435  		generic.Replace(str)
   436  	}
   437  }
   438  
   439  func BenchmarkGenericMatch2(b *testing.B) {
   440  	str := Repeat("It&apos;s &lt;b&gt;HTML&lt;/b&gt;!", 100)
   441  	for i := 0; i < b.N; i++ {
   442  		htmlUnescaper.Replace(str)
   443  	}
   444  }
   445  
   446  func benchmarkSingleString(b *testing.B, pattern, text string) {
   447  	r := NewReplacer(pattern, "[match]")
   448  	b.SetBytes(int64(len(text)))
   449  	b.ResetTimer()
   450  	for i := 0; i < b.N; i++ {
   451  		r.Replace(text)
   452  	}
   453  }
   454  
   455  func BenchmarkSingleMaxSkipping(b *testing.B) {
   456  	benchmarkSingleString(b, Repeat("b", 25), Repeat("a", 10000))
   457  }
   458  
   459  func BenchmarkSingleLongSuffixFail(b *testing.B) {
   460  	benchmarkSingleString(b, "b"+Repeat("a", 500), Repeat("a", 1002))
   461  }
   462  
   463  func BenchmarkSingleMatch(b *testing.B) {
   464  	benchmarkSingleString(b, "abcdef", Repeat("abcdefghijklmno", 1000))
   465  }
   466  
   467  func BenchmarkByteByteNoMatch(b *testing.B) {
   468  	str := Repeat("A", 100) + Repeat("B", 100)
   469  	for i := 0; i < b.N; i++ {
   470  		capitalLetters.Replace(str)
   471  	}
   472  }
   473  
   474  func BenchmarkByteByteMatch(b *testing.B) {
   475  	str := Repeat("a", 100) + Repeat("b", 100)
   476  	for i := 0; i < b.N; i++ {
   477  		capitalLetters.Replace(str)
   478  	}
   479  }
   480  
   481  func BenchmarkByteStringMatch(b *testing.B) {
   482  	str := "<" + Repeat("a", 99) + Repeat("b", 99) + ">"
   483  	for i := 0; i < b.N; i++ {
   484  		htmlEscaper.Replace(str)
   485  	}
   486  }
   487  
   488  func BenchmarkHTMLEscapeNew(b *testing.B) {
   489  	str := "I <3 to escape HTML & other text too."
   490  	for i := 0; i < b.N; i++ {
   491  		htmlEscaper.Replace(str)
   492  	}
   493  }
   494  
   495  func BenchmarkHTMLEscapeOld(b *testing.B) {
   496  	str := "I <3 to escape HTML & other text too."
   497  	for i := 0; i < b.N; i++ {
   498  		oldHTMLEscape(str)
   499  	}
   500  }
   501  
   502  func BenchmarkByteStringReplacerWriteString(b *testing.B) {
   503  	str := Repeat("I <3 to escape HTML & other text too.", 100)
   504  	buf := new(bytes.Buffer)
   505  	for i := 0; i < b.N; i++ {
   506  		htmlEscaper.WriteString(buf, str)
   507  		buf.Reset()
   508  	}
   509  }
   510  
   511  func BenchmarkByteReplacerWriteString(b *testing.B) {
   512  	str := Repeat("abcdefghijklmnopqrstuvwxyz", 100)
   513  	buf := new(bytes.Buffer)
   514  	for i := 0; i < b.N; i++ {
   515  		capitalLetters.WriteString(buf, str)
   516  		buf.Reset()
   517  	}
   518  }
   519  
   520  // BenchmarkByteByteReplaces compares byteByteImpl against multiple Replaces.
   521  func BenchmarkByteByteReplaces(b *testing.B) {
   522  	str := Repeat("a", 100) + Repeat("b", 100)
   523  	for i := 0; i < b.N; i++ {
   524  		Replace(Replace(str, "a", "A", -1), "b", "B", -1)
   525  	}
   526  }
   527  
   528  // BenchmarkByteByteMap compares byteByteImpl against Map.
   529  func BenchmarkByteByteMap(b *testing.B) {
   530  	str := Repeat("a", 100) + Repeat("b", 100)
   531  	fn := func(r rune) rune {
   532  		switch r {
   533  		case 'a':
   534  			return 'A'
   535  		case 'b':
   536  			return 'B'
   537  		}
   538  		return r
   539  	}
   540  	for i := 0; i < b.N; i++ {
   541  		Map(fn, str)
   542  	}
   543  }
   544  
   545  var mapdata = []struct{ name, data string }{
   546  	{"ASCII", "a b c d e f g h i j k l m n o p q r s t u v w x y z"},
   547  	{"Greek", "α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ ς σ τ υ φ χ ψ ω"},
   548  }
   549  
   550  func BenchmarkMap(b *testing.B) {
   551  	mapidentity := func(r rune) rune {
   552  		return r
   553  	}
   554  
   555  	b.Run("identity", func(b *testing.B) {
   556  		for _, md := range mapdata {
   557  			b.Run(md.name, func(b *testing.B) {
   558  				for i := 0; i < b.N; i++ {
   559  					Map(mapidentity, md.data)
   560  				}
   561  			})
   562  		}
   563  	})
   564  
   565  	mapchange := func(r rune) rune {
   566  		if 'a' <= r && r <= 'z' {
   567  			return r + 'A' - 'a'
   568  		}
   569  		if 'α' <= r && r <= 'ω' {
   570  			return r + 'Α' - 'α'
   571  		}
   572  		return r
   573  	}
   574  
   575  	b.Run("change", func(b *testing.B) {
   576  		for _, md := range mapdata {
   577  			b.Run(md.name, func(b *testing.B) {
   578  				for i := 0; i < b.N; i++ {
   579  					Map(mapchange, md.data)
   580  				}
   581  			})
   582  		}
   583  	})
   584  }