github.com/pgavlin/text@v0.0.0-20240419000839-8438d0a47805/replace_test.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package text_test 6 7 import ( 8 "bytes" 9 "fmt" 10 "testing" 11 12 . "github.com/pgavlin/text" 13 ) 14 15 var htmlEscaper = NewReplacer( 16 "&", "&", 17 "<", "<", 18 ">", ">", 19 `"`, """, 20 "'", "'", 21 ) 22 23 var htmlUnescaper = NewReplacer( 24 "&", "&", 25 "<", "<", 26 ">", ">", 27 """, `"`, 28 "'", "'", 29 ) 30 31 // The http package's old HTML escaping function. 32 func oldHTMLEscape(s string) string { 33 s = Replace(s, "&", "&", -1) 34 s = Replace(s, "<", "<", -1) 35 s = Replace(s, ">", ">", -1) 36 s = Replace(s, `"`, """, -1) 37 s = Replace(s, "'", "'", -1) 38 return s 39 } 40 41 var capitalLetters = NewReplacer("a", "A", "b", "B") 42 43 // TestReplacer tests the replacer implementations. 44 func TestReplacer(t *testing.T) { 45 type testCase struct { 46 r *Replacer[string] 47 in, out string 48 } 49 var testCases []testCase 50 51 // str converts 0xff to "\xff". This isn't just string(b) since that converts to UTF-8. 52 str := func(b byte) string { 53 return string([]byte{b}) 54 } 55 var s []string 56 57 // inc maps "\x00"->"\x01", ..., "a"->"b", "b"->"c", ..., "\xff"->"\x00". 58 s = nil 59 for i := 0; i < 256; i++ { 60 s = append(s, str(byte(i)), str(byte(i+1))) 61 } 62 inc := NewReplacer(s...) 63 64 // Test cases with 1-byte old strings, 1-byte new strings. 65 testCases = append(testCases, 66 testCase{capitalLetters, "brad", "BrAd"}, 67 testCase{capitalLetters, Repeat("a", (32<<10)+123), Repeat("A", (32<<10)+123)}, 68 testCase{capitalLetters, "", ""}, 69 70 testCase{inc, "brad", "csbe"}, 71 testCase{inc, "\x00\xff", "\x01\x00"}, 72 testCase{inc, "", ""}, 73 74 testCase{NewReplacer("a", "1", "a", "2"), "brad", "br1d"}, 75 ) 76 77 // repeat maps "a"->"a", "b"->"bb", "c"->"ccc", ... 78 s = nil 79 for i := 0; i < 256; i++ { 80 n := i + 1 - 'a' 81 if n < 1 { 82 n = 1 83 } 84 s = append(s, str(byte(i)), Repeat(str(byte(i)), n)) 85 } 86 repeat := NewReplacer(s...) 87 88 // Test cases with 1-byte old strings, variable length new strings. 89 testCases = append(testCases, 90 testCase{htmlEscaper, "No changes", "No changes"}, 91 testCase{htmlEscaper, "I <3 escaping & stuff", "I <3 escaping & stuff"}, 92 testCase{htmlEscaper, "&&&", "&&&"}, 93 testCase{htmlEscaper, "", ""}, 94 95 testCase{repeat, "brad", "bbrrrrrrrrrrrrrrrrrradddd"}, 96 testCase{repeat, "abba", "abbbba"}, 97 testCase{repeat, "", ""}, 98 99 testCase{NewReplacer("a", "11", "a", "22"), "brad", "br11d"}, 100 ) 101 102 // The remaining test cases have variable length old strings. 103 104 testCases = append(testCases, 105 testCase{htmlUnescaper, "&amp;", "&"}, 106 testCase{htmlUnescaper, "<b>HTML's neat</b>", "<b>HTML's neat</b>"}, 107 testCase{htmlUnescaper, "", ""}, 108 109 testCase{NewReplacer("a", "1", "a", "2", "xxx", "xxx"), "brad", "br1d"}, 110 111 testCase{NewReplacer("a", "1", "aa", "2", "aaa", "3"), "aaaa", "1111"}, 112 113 testCase{NewReplacer("aaa", "3", "aa", "2", "a", "1"), "aaaa", "31"}, 114 ) 115 116 // gen1 has multiple old strings of variable length. There is no 117 // overall non-empty common prefix, but some pairwise common prefixes. 118 gen1 := NewReplacer( 119 "aaa", "3[aaa]", 120 "aa", "2[aa]", 121 "a", "1[a]", 122 "i", "i", 123 "longerst", "most long", 124 "longer", "medium", 125 "long", "short", 126 "xx", "xx", 127 "x", "X", 128 "X", "Y", 129 "Y", "Z", 130 ) 131 testCases = append(testCases, 132 testCase{gen1, "fooaaabar", "foo3[aaa]b1[a]r"}, 133 testCase{gen1, "long, longerst, longer", "short, most long, medium"}, 134 testCase{gen1, "xxxxx", "xxxxX"}, 135 testCase{gen1, "XiX", "YiY"}, 136 testCase{gen1, "", ""}, 137 ) 138 139 // gen2 has multiple old strings with no pairwise common prefix. 140 gen2 := NewReplacer( 141 "roses", "red", 142 "violets", "blue", 143 "sugar", "sweet", 144 ) 145 testCases = append(testCases, 146 testCase{gen2, "roses are red, violets are blue...", "red are red, blue are blue..."}, 147 testCase{gen2, "", ""}, 148 ) 149 150 // gen3 has multiple old strings with an overall common prefix. 151 gen3 := NewReplacer( 152 "abracadabra", "poof", 153 "abracadabrakazam", "splat", 154 "abraham", "lincoln", 155 "abrasion", "scrape", 156 "abraham", "isaac", 157 ) 158 testCases = append(testCases, 159 testCase{gen3, "abracadabrakazam abraham", "poofkazam lincoln"}, 160 testCase{gen3, "abrasion abracad", "scrape abracad"}, 161 testCase{gen3, "abba abram abrasive", "abba abram abrasive"}, 162 testCase{gen3, "", ""}, 163 ) 164 165 // foo{1,2,3,4} have multiple old strings with an overall common prefix 166 // and 1- or 2- byte extensions from the common prefix. 167 foo1 := NewReplacer( 168 "foo1", "A", 169 "foo2", "B", 170 "foo3", "C", 171 ) 172 foo2 := NewReplacer( 173 "foo1", "A", 174 "foo2", "B", 175 "foo31", "C", 176 "foo32", "D", 177 ) 178 foo3 := NewReplacer( 179 "foo11", "A", 180 "foo12", "B", 181 "foo31", "C", 182 "foo32", "D", 183 ) 184 foo4 := NewReplacer( 185 "foo12", "B", 186 "foo32", "D", 187 ) 188 testCases = append(testCases, 189 testCase{foo1, "fofoofoo12foo32oo", "fofooA2C2oo"}, 190 testCase{foo1, "", ""}, 191 192 testCase{foo2, "fofoofoo12foo32oo", "fofooA2Doo"}, 193 testCase{foo2, "", ""}, 194 195 testCase{foo3, "fofoofoo12foo32oo", "fofooBDoo"}, 196 testCase{foo3, "", ""}, 197 198 testCase{foo4, "fofoofoo12foo32oo", "fofooBDoo"}, 199 testCase{foo4, "", ""}, 200 ) 201 202 // genAll maps "\x00\x01\x02...\xfe\xff" to "[all]", amongst other things. 203 allBytes := make([]byte, 256) 204 for i := range allBytes { 205 allBytes[i] = byte(i) 206 } 207 allString := string(allBytes) 208 genAll := NewReplacer( 209 allString, "[all]", 210 "\xff", "[ff]", 211 "\x00", "[00]", 212 ) 213 testCases = append(testCases, 214 testCase{genAll, allString, "[all]"}, 215 testCase{genAll, "a\xff" + allString + "\x00", "a[ff][all][00]"}, 216 testCase{genAll, "", ""}, 217 ) 218 219 // Test cases with empty old strings. 220 221 blankToX1 := NewReplacer("", "X") 222 blankToX2 := NewReplacer("", "X", "", "") 223 blankHighPriority := NewReplacer("", "X", "o", "O") 224 blankLowPriority := NewReplacer("o", "O", "", "X") 225 blankNoOp1 := NewReplacer("", "") 226 blankNoOp2 := NewReplacer("", "", "", "A") 227 blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z") 228 testCases = append(testCases, 229 testCase{blankToX1, "foo", "XfXoXoX"}, 230 testCase{blankToX1, "", "X"}, 231 232 testCase{blankToX2, "foo", "XfXoXoX"}, 233 testCase{blankToX2, "", "X"}, 234 235 testCase{blankHighPriority, "oo", "XOXOX"}, 236 testCase{blankHighPriority, "ii", "XiXiX"}, 237 testCase{blankHighPriority, "oiio", "XOXiXiXOX"}, 238 testCase{blankHighPriority, "iooi", "XiXOXOXiX"}, 239 testCase{blankHighPriority, "", "X"}, 240 241 testCase{blankLowPriority, "oo", "OOX"}, 242 testCase{blankLowPriority, "ii", "XiXiX"}, 243 testCase{blankLowPriority, "oiio", "OXiXiOX"}, 244 testCase{blankLowPriority, "iooi", "XiOOXiX"}, 245 testCase{blankLowPriority, "", "X"}, 246 247 testCase{blankNoOp1, "foo", "foo"}, 248 testCase{blankNoOp1, "", ""}, 249 250 testCase{blankNoOp2, "foo", "foo"}, 251 testCase{blankNoOp2, "", ""}, 252 253 testCase{blankFoo, "foobarfoobaz", "XRXZX"}, 254 testCase{blankFoo, "foobar-foobaz", "XRX-XZX"}, 255 testCase{blankFoo, "", "X"}, 256 ) 257 258 // single string replacer 259 260 abcMatcher := NewReplacer("abc", "[match]") 261 262 testCases = append(testCases, 263 testCase{abcMatcher, "", ""}, 264 testCase{abcMatcher, "ab", "ab"}, 265 testCase{abcMatcher, "abc", "[match]"}, 266 testCase{abcMatcher, "abcd", "[match]d"}, 267 testCase{abcMatcher, "cabcabcdabca", "c[match][match]d[match]a"}, 268 ) 269 270 // Issue 6659 cases (more single string replacer) 271 272 noHello := NewReplacer("Hello", "") 273 testCases = append(testCases, 274 testCase{noHello, "Hello", ""}, 275 testCase{noHello, "Hellox", "x"}, 276 testCase{noHello, "xHello", "x"}, 277 testCase{noHello, "xHellox", "xx"}, 278 ) 279 280 // No-arg test cases. 281 282 nop := NewReplacer[string]() 283 testCases = append(testCases, 284 testCase{nop, "abc", "abc"}, 285 testCase{nop, "", ""}, 286 ) 287 288 // Run the test cases. 289 290 for i, tc := range testCases { 291 if s := tc.r.Replace(tc.in); s != tc.out { 292 t.Errorf("%d. Replace(%q) = %q, want %q", i, tc.in, s, tc.out) 293 } 294 var buf bytes.Buffer 295 n, err := tc.r.WriteString(&buf, tc.in) 296 if err != nil { 297 t.Errorf("%d. WriteString: %v", i, err) 298 continue 299 } 300 got := buf.String() 301 if got != tc.out { 302 t.Errorf("%d. WriteString(%q) wrote %q, want %q", i, tc.in, got, tc.out) 303 continue 304 } 305 if n != len(tc.out) { 306 t.Errorf("%d. WriteString(%q) wrote correct string but reported %d bytes; want %d (%q)", 307 i, tc.in, n, len(tc.out), tc.out) 308 } 309 } 310 } 311 312 var algorithmTestCases = []struct { 313 r *Replacer[string] 314 want string 315 }{ 316 {capitalLetters, "*text.byteReplacer[string]"}, 317 {htmlEscaper, "*text.byteStringReplacer[string]"}, 318 {NewReplacer("12", "123"), "*text.singleStringReplacer[string]"}, 319 {NewReplacer("1", "12"), "*text.byteStringReplacer[string]"}, 320 {NewReplacer("", "X"), "*text.genericReplacer[string]"}, 321 {NewReplacer("a", "1", "b", "12", "cde", "123"), "*text.genericReplacer[string]"}, 322 } 323 324 // TestPickAlgorithm tests that NewReplacer picks the correct algorithm. 325 func TestPickAlgorithm(t *testing.T) { 326 for i, tc := range algorithmTestCases { 327 got := fmt.Sprintf("%T", tc.r.Replacer()) 328 if got != tc.want { 329 t.Errorf("%d. algorithm = %s, want %s", i, got, tc.want) 330 } 331 } 332 } 333 334 type errWriter struct{} 335 336 func (errWriter) Write(p []byte) (n int, err error) { 337 return 0, fmt.Errorf("unwritable") 338 } 339 340 // TestWriteStringError tests that WriteString returns an error 341 // received from the underlying io.Writer. 342 func TestWriteStringError(t *testing.T) { 343 for i, tc := range algorithmTestCases { 344 n, err := tc.r.WriteString(errWriter{}, "abc") 345 if n != 0 || err == nil || err.Error() != "unwritable" { 346 t.Errorf("%d. WriteStringError = %d, %v, want 0, unwritable", i, n, err) 347 } 348 } 349 } 350 351 // TestGenericTrieBuilding verifies the structure of the generated trie. There 352 // is one node per line, and the key ending with the current line is in the 353 // trie if it ends with a "+". 354 func TestGenericTrieBuilding(t *testing.T) { 355 testCases := []struct{ in, out string }{ 356 {"abc;abdef;abdefgh;xx;xy;z", `- 357 a- 358 .b- 359 ..c+ 360 ..d- 361 ...ef+ 362 .....gh+ 363 x- 364 .x+ 365 .y+ 366 z+ 367 `}, 368 {"abracadabra;abracadabrakazam;abraham;abrasion", `- 369 a- 370 .bra- 371 ....c- 372 .....adabra+ 373 ...........kazam+ 374 ....h- 375 .....am+ 376 ....s- 377 .....ion+ 378 `}, 379 {"aaa;aa;a;i;longerst;longer;long;xx;x;X;Y", `- 380 X+ 381 Y+ 382 a+ 383 .a+ 384 ..a+ 385 i+ 386 l- 387 .ong+ 388 ....er+ 389 ......st+ 390 x+ 391 .x+ 392 `}, 393 {"foo;;foo;foo1", `+ 394 f- 395 .oo+ 396 ...1+ 397 `}, 398 } 399 400 for _, tc := range testCases { 401 keys := Split(tc.in, ";") 402 args := make([]string, len(keys)*2) 403 for i, key := range keys { 404 args[i*2] = key 405 } 406 407 got := NewReplacer(args...).PrintTrie() 408 // Remove tabs from tc.out 409 wantbuf := make([]byte, 0, len(tc.out)) 410 for i := 0; i < len(tc.out); i++ { 411 if tc.out[i] != '\t' { 412 wantbuf = append(wantbuf, tc.out[i]) 413 } 414 } 415 want := string(wantbuf) 416 417 if got != want { 418 t.Errorf("PrintTrie(%q)\ngot\n%swant\n%s", tc.in, got, want) 419 } 420 } 421 } 422 423 func BenchmarkGenericNoMatch(b *testing.B) { 424 str := Repeat("A", 100) + Repeat("B", 100) 425 generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic 426 for i := 0; i < b.N; i++ { 427 generic.Replace(str) 428 } 429 } 430 431 func BenchmarkGenericMatch1(b *testing.B) { 432 str := Repeat("a", 100) + Repeat("b", 100) 433 generic := NewReplacer("a", "A", "b", "B", "12", "123") 434 for i := 0; i < b.N; i++ { 435 generic.Replace(str) 436 } 437 } 438 439 func BenchmarkGenericMatch2(b *testing.B) { 440 str := Repeat("It's <b>HTML</b>!", 100) 441 for i := 0; i < b.N; i++ { 442 htmlUnescaper.Replace(str) 443 } 444 } 445 446 func benchmarkSingleString(b *testing.B, pattern, text string) { 447 r := NewReplacer(pattern, "[match]") 448 b.SetBytes(int64(len(text))) 449 b.ResetTimer() 450 for i := 0; i < b.N; i++ { 451 r.Replace(text) 452 } 453 } 454 455 func BenchmarkSingleMaxSkipping(b *testing.B) { 456 benchmarkSingleString(b, Repeat("b", 25), Repeat("a", 10000)) 457 } 458 459 func BenchmarkSingleLongSuffixFail(b *testing.B) { 460 benchmarkSingleString(b, "b"+Repeat("a", 500), Repeat("a", 1002)) 461 } 462 463 func BenchmarkSingleMatch(b *testing.B) { 464 benchmarkSingleString(b, "abcdef", Repeat("abcdefghijklmno", 1000)) 465 } 466 467 func BenchmarkByteByteNoMatch(b *testing.B) { 468 str := Repeat("A", 100) + Repeat("B", 100) 469 for i := 0; i < b.N; i++ { 470 capitalLetters.Replace(str) 471 } 472 } 473 474 func BenchmarkByteByteMatch(b *testing.B) { 475 str := Repeat("a", 100) + Repeat("b", 100) 476 for i := 0; i < b.N; i++ { 477 capitalLetters.Replace(str) 478 } 479 } 480 481 func BenchmarkByteStringMatch(b *testing.B) { 482 str := "<" + Repeat("a", 99) + Repeat("b", 99) + ">" 483 for i := 0; i < b.N; i++ { 484 htmlEscaper.Replace(str) 485 } 486 } 487 488 func BenchmarkHTMLEscapeNew(b *testing.B) { 489 str := "I <3 to escape HTML & other text too." 490 for i := 0; i < b.N; i++ { 491 htmlEscaper.Replace(str) 492 } 493 } 494 495 func BenchmarkHTMLEscapeOld(b *testing.B) { 496 str := "I <3 to escape HTML & other text too." 497 for i := 0; i < b.N; i++ { 498 oldHTMLEscape(str) 499 } 500 } 501 502 func BenchmarkByteStringReplacerWriteString(b *testing.B) { 503 str := Repeat("I <3 to escape HTML & other text too.", 100) 504 buf := new(bytes.Buffer) 505 for i := 0; i < b.N; i++ { 506 htmlEscaper.WriteString(buf, str) 507 buf.Reset() 508 } 509 } 510 511 func BenchmarkByteReplacerWriteString(b *testing.B) { 512 str := Repeat("abcdefghijklmnopqrstuvwxyz", 100) 513 buf := new(bytes.Buffer) 514 for i := 0; i < b.N; i++ { 515 capitalLetters.WriteString(buf, str) 516 buf.Reset() 517 } 518 } 519 520 // BenchmarkByteByteReplaces compares byteByteImpl against multiple Replaces. 521 func BenchmarkByteByteReplaces(b *testing.B) { 522 str := Repeat("a", 100) + Repeat("b", 100) 523 for i := 0; i < b.N; i++ { 524 Replace(Replace(str, "a", "A", -1), "b", "B", -1) 525 } 526 } 527 528 // BenchmarkByteByteMap compares byteByteImpl against Map. 529 func BenchmarkByteByteMap(b *testing.B) { 530 str := Repeat("a", 100) + Repeat("b", 100) 531 fn := func(r rune) rune { 532 switch r { 533 case 'a': 534 return 'A' 535 case 'b': 536 return 'B' 537 } 538 return r 539 } 540 for i := 0; i < b.N; i++ { 541 Map(fn, str) 542 } 543 } 544 545 var mapdata = []struct{ name, data string }{ 546 {"ASCII", "a b c d e f g h i j k l m n o p q r s t u v w x y z"}, 547 {"Greek", "α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ ς σ τ υ φ χ ψ ω"}, 548 } 549 550 func BenchmarkMap(b *testing.B) { 551 mapidentity := func(r rune) rune { 552 return r 553 } 554 555 b.Run("identity", func(b *testing.B) { 556 for _, md := range mapdata { 557 b.Run(md.name, func(b *testing.B) { 558 for i := 0; i < b.N; i++ { 559 Map(mapidentity, md.data) 560 } 561 }) 562 } 563 }) 564 565 mapchange := func(r rune) rune { 566 if 'a' <= r && r <= 'z' { 567 return r + 'A' - 'a' 568 } 569 if 'α' <= r && r <= 'ω' { 570 return r + 'Α' - 'α' 571 } 572 return r 573 } 574 575 b.Run("change", func(b *testing.B) { 576 for _, md := range mapdata { 577 b.Run(md.name, func(b *testing.B) { 578 for i := 0; i < b.N; i++ { 579 Map(mapchange, md.data) 580 } 581 }) 582 } 583 }) 584 }