github.com/andybalholm/brotli@v1.0.6/static_dict.go (about) 1 package brotli 2 3 import "encoding/binary" 4 5 /* Copyright 2013 Google Inc. All Rights Reserved. 6 7 Distributed under MIT license. 8 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 9 */ 10 11 /* Class to model the static dictionary. */ 12 13 const maxStaticDictionaryMatchLen = 37 14 15 const kInvalidMatch uint32 = 0xFFFFFFF 16 17 /* Copyright 2013 Google Inc. All Rights Reserved. 18 19 Distributed under MIT license. 20 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 21 */ 22 func hash(data []byte) uint32 { 23 var h uint32 = binary.LittleEndian.Uint32(data) * kDictHashMul32 24 25 /* The higher bits contain more mixture from the multiplication, 26 so we take our results from there. */ 27 return h >> uint(32-kDictNumBits) 28 } 29 30 func addMatch(distance uint, len uint, len_code uint, matches []uint32) { 31 var match uint32 = uint32((distance << 5) + len_code) 32 matches[len] = brotli_min_uint32_t(matches[len], match) 33 } 34 35 func dictMatchLength(dict *dictionary, data []byte, id uint, len uint, maxlen uint) uint { 36 var offset uint = uint(dict.offsets_by_length[len]) + len*id 37 return findMatchLengthWithLimit(dict.data[offset:], data, brotli_min_size_t(uint(len), maxlen)) 38 } 39 40 func isMatch(d *dictionary, w dictWord, data []byte, max_length uint) bool { 41 if uint(w.len) > max_length { 42 return false 43 } else { 44 var offset uint = uint(d.offsets_by_length[w.len]) + uint(w.len)*uint(w.idx) 45 var dict []byte = d.data[offset:] 46 if w.transform == 0 { 47 /* Match against base dictionary word. */ 48 return findMatchLengthWithLimit(dict, data, uint(w.len)) == uint(w.len) 49 } else if w.transform == 10 { 50 /* Match against uppercase first transform. 51 Note that there are only ASCII uppercase words in the lookup table. */ 52 return dict[0] >= 'a' && dict[0] <= 'z' && (dict[0]^32) == data[0] && findMatchLengthWithLimit(dict[1:], data[1:], uint(w.len)-1) == uint(w.len-1) 53 } else { 54 /* Match against uppercase all transform. 55 Note that there are only ASCII uppercase words in the lookup table. */ 56 var i uint 57 for i = 0; i < uint(w.len); i++ { 58 if dict[i] >= 'a' && dict[i] <= 'z' { 59 if (dict[i] ^ 32) != data[i] { 60 return false 61 } 62 } else { 63 if dict[i] != data[i] { 64 return false 65 } 66 } 67 } 68 69 return true 70 } 71 } 72 } 73 74 func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_length uint, max_length uint, matches []uint32) bool { 75 var has_found_match bool = false 76 { 77 var offset uint = uint(dict.buckets[hash(data)]) 78 var end bool = offset == 0 79 for !end { 80 w := dict.dict_words[offset] 81 offset++ 82 var l uint = uint(w.len) & 0x1F 83 var n uint = uint(1) << dict.words.size_bits_by_length[l] 84 var id uint = uint(w.idx) 85 end = !(w.len&0x80 == 0) 86 w.len = byte(l) 87 if w.transform == 0 { 88 var matchlen uint = dictMatchLength(dict.words, data, id, l, max_length) 89 var s []byte 90 var minlen uint 91 var maxlen uint 92 var len uint 93 94 /* Transform "" + BROTLI_TRANSFORM_IDENTITY + "" */ 95 if matchlen == l { 96 addMatch(id, l, l, matches) 97 has_found_match = true 98 } 99 100 /* Transforms "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "" and 101 "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "ing " */ 102 if matchlen >= l-1 { 103 addMatch(id+12*n, l-1, l, matches) 104 if l+2 < max_length && data[l-1] == 'i' && data[l] == 'n' && data[l+1] == 'g' && data[l+2] == ' ' { 105 addMatch(id+49*n, l+3, l, matches) 106 } 107 108 has_found_match = true 109 } 110 111 /* Transform "" + BROTLI_TRANSFORM_OMIT_LAST_# + "" (# = 2 .. 9) */ 112 minlen = min_length 113 114 if l > 9 { 115 minlen = brotli_max_size_t(minlen, l-9) 116 } 117 maxlen = brotli_min_size_t(matchlen, l-2) 118 for len = minlen; len <= maxlen; len++ { 119 var cut uint = l - len 120 var transform_id uint = (cut << 2) + uint((dict.cutoffTransforms>>(cut*6))&0x3F) 121 addMatch(id+transform_id*n, uint(len), l, matches) 122 has_found_match = true 123 } 124 125 if matchlen < l || l+6 >= max_length { 126 continue 127 } 128 129 s = data[l:] 130 131 /* Transforms "" + BROTLI_TRANSFORM_IDENTITY + <suffix> */ 132 if s[0] == ' ' { 133 addMatch(id+n, l+1, l, matches) 134 if s[1] == 'a' { 135 if s[2] == ' ' { 136 addMatch(id+28*n, l+3, l, matches) 137 } else if s[2] == 's' { 138 if s[3] == ' ' { 139 addMatch(id+46*n, l+4, l, matches) 140 } 141 } else if s[2] == 't' { 142 if s[3] == ' ' { 143 addMatch(id+60*n, l+4, l, matches) 144 } 145 } else if s[2] == 'n' { 146 if s[3] == 'd' && s[4] == ' ' { 147 addMatch(id+10*n, l+5, l, matches) 148 } 149 } 150 } else if s[1] == 'b' { 151 if s[2] == 'y' && s[3] == ' ' { 152 addMatch(id+38*n, l+4, l, matches) 153 } 154 } else if s[1] == 'i' { 155 if s[2] == 'n' { 156 if s[3] == ' ' { 157 addMatch(id+16*n, l+4, l, matches) 158 } 159 } else if s[2] == 's' { 160 if s[3] == ' ' { 161 addMatch(id+47*n, l+4, l, matches) 162 } 163 } 164 } else if s[1] == 'f' { 165 if s[2] == 'o' { 166 if s[3] == 'r' && s[4] == ' ' { 167 addMatch(id+25*n, l+5, l, matches) 168 } 169 } else if s[2] == 'r' { 170 if s[3] == 'o' && s[4] == 'm' && s[5] == ' ' { 171 addMatch(id+37*n, l+6, l, matches) 172 } 173 } 174 } else if s[1] == 'o' { 175 if s[2] == 'f' { 176 if s[3] == ' ' { 177 addMatch(id+8*n, l+4, l, matches) 178 } 179 } else if s[2] == 'n' { 180 if s[3] == ' ' { 181 addMatch(id+45*n, l+4, l, matches) 182 } 183 } 184 } else if s[1] == 'n' { 185 if s[2] == 'o' && s[3] == 't' && s[4] == ' ' { 186 addMatch(id+80*n, l+5, l, matches) 187 } 188 } else if s[1] == 't' { 189 if s[2] == 'h' { 190 if s[3] == 'e' { 191 if s[4] == ' ' { 192 addMatch(id+5*n, l+5, l, matches) 193 } 194 } else if s[3] == 'a' { 195 if s[4] == 't' && s[5] == ' ' { 196 addMatch(id+29*n, l+6, l, matches) 197 } 198 } 199 } else if s[2] == 'o' { 200 if s[3] == ' ' { 201 addMatch(id+17*n, l+4, l, matches) 202 } 203 } 204 } else if s[1] == 'w' { 205 if s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ' { 206 addMatch(id+35*n, l+6, l, matches) 207 } 208 } 209 } else if s[0] == '"' { 210 addMatch(id+19*n, l+1, l, matches) 211 if s[1] == '>' { 212 addMatch(id+21*n, l+2, l, matches) 213 } 214 } else if s[0] == '.' { 215 addMatch(id+20*n, l+1, l, matches) 216 if s[1] == ' ' { 217 addMatch(id+31*n, l+2, l, matches) 218 if s[2] == 'T' && s[3] == 'h' { 219 if s[4] == 'e' { 220 if s[5] == ' ' { 221 addMatch(id+43*n, l+6, l, matches) 222 } 223 } else if s[4] == 'i' { 224 if s[5] == 's' && s[6] == ' ' { 225 addMatch(id+75*n, l+7, l, matches) 226 } 227 } 228 } 229 } 230 } else if s[0] == ',' { 231 addMatch(id+76*n, l+1, l, matches) 232 if s[1] == ' ' { 233 addMatch(id+14*n, l+2, l, matches) 234 } 235 } else if s[0] == '\n' { 236 addMatch(id+22*n, l+1, l, matches) 237 if s[1] == '\t' { 238 addMatch(id+50*n, l+2, l, matches) 239 } 240 } else if s[0] == ']' { 241 addMatch(id+24*n, l+1, l, matches) 242 } else if s[0] == '\'' { 243 addMatch(id+36*n, l+1, l, matches) 244 } else if s[0] == ':' { 245 addMatch(id+51*n, l+1, l, matches) 246 } else if s[0] == '(' { 247 addMatch(id+57*n, l+1, l, matches) 248 } else if s[0] == '=' { 249 if s[1] == '"' { 250 addMatch(id+70*n, l+2, l, matches) 251 } else if s[1] == '\'' { 252 addMatch(id+86*n, l+2, l, matches) 253 } 254 } else if s[0] == 'a' { 255 if s[1] == 'l' && s[2] == ' ' { 256 addMatch(id+84*n, l+3, l, matches) 257 } 258 } else if s[0] == 'e' { 259 if s[1] == 'd' { 260 if s[2] == ' ' { 261 addMatch(id+53*n, l+3, l, matches) 262 } 263 } else if s[1] == 'r' { 264 if s[2] == ' ' { 265 addMatch(id+82*n, l+3, l, matches) 266 } 267 } else if s[1] == 's' { 268 if s[2] == 't' && s[3] == ' ' { 269 addMatch(id+95*n, l+4, l, matches) 270 } 271 } 272 } else if s[0] == 'f' { 273 if s[1] == 'u' && s[2] == 'l' && s[3] == ' ' { 274 addMatch(id+90*n, l+4, l, matches) 275 } 276 } else if s[0] == 'i' { 277 if s[1] == 'v' { 278 if s[2] == 'e' && s[3] == ' ' { 279 addMatch(id+92*n, l+4, l, matches) 280 } 281 } else if s[1] == 'z' { 282 if s[2] == 'e' && s[3] == ' ' { 283 addMatch(id+100*n, l+4, l, matches) 284 } 285 } 286 } else if s[0] == 'l' { 287 if s[1] == 'e' { 288 if s[2] == 's' && s[3] == 's' && s[4] == ' ' { 289 addMatch(id+93*n, l+5, l, matches) 290 } 291 } else if s[1] == 'y' { 292 if s[2] == ' ' { 293 addMatch(id+61*n, l+3, l, matches) 294 } 295 } 296 } else if s[0] == 'o' { 297 if s[1] == 'u' && s[2] == 's' && s[3] == ' ' { 298 addMatch(id+106*n, l+4, l, matches) 299 } 300 } 301 } else { 302 var is_all_caps bool = (w.transform != transformUppercaseFirst) 303 /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and 304 is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL) 305 transform. */ 306 307 var s []byte 308 if !isMatch(dict.words, w, data, max_length) { 309 continue 310 } 311 312 /* Transform "" + kUppercase{First,All} + "" */ 313 var tmp int 314 if is_all_caps { 315 tmp = 44 316 } else { 317 tmp = 9 318 } 319 addMatch(id+uint(tmp)*n, l, l, matches) 320 321 has_found_match = true 322 if l+1 >= max_length { 323 continue 324 } 325 326 /* Transforms "" + kUppercase{First,All} + <suffix> */ 327 s = data[l:] 328 329 if s[0] == ' ' { 330 var tmp int 331 if is_all_caps { 332 tmp = 68 333 } else { 334 tmp = 4 335 } 336 addMatch(id+uint(tmp)*n, l+1, l, matches) 337 } else if s[0] == '"' { 338 var tmp int 339 if is_all_caps { 340 tmp = 87 341 } else { 342 tmp = 66 343 } 344 addMatch(id+uint(tmp)*n, l+1, l, matches) 345 if s[1] == '>' { 346 var tmp int 347 if is_all_caps { 348 tmp = 97 349 } else { 350 tmp = 69 351 } 352 addMatch(id+uint(tmp)*n, l+2, l, matches) 353 } 354 } else if s[0] == '.' { 355 var tmp int 356 if is_all_caps { 357 tmp = 101 358 } else { 359 tmp = 79 360 } 361 addMatch(id+uint(tmp)*n, l+1, l, matches) 362 if s[1] == ' ' { 363 var tmp int 364 if is_all_caps { 365 tmp = 114 366 } else { 367 tmp = 88 368 } 369 addMatch(id+uint(tmp)*n, l+2, l, matches) 370 } 371 } else if s[0] == ',' { 372 var tmp int 373 if is_all_caps { 374 tmp = 112 375 } else { 376 tmp = 99 377 } 378 addMatch(id+uint(tmp)*n, l+1, l, matches) 379 if s[1] == ' ' { 380 var tmp int 381 if is_all_caps { 382 tmp = 107 383 } else { 384 tmp = 58 385 } 386 addMatch(id+uint(tmp)*n, l+2, l, matches) 387 } 388 } else if s[0] == '\'' { 389 var tmp int 390 if is_all_caps { 391 tmp = 94 392 } else { 393 tmp = 74 394 } 395 addMatch(id+uint(tmp)*n, l+1, l, matches) 396 } else if s[0] == '(' { 397 var tmp int 398 if is_all_caps { 399 tmp = 113 400 } else { 401 tmp = 78 402 } 403 addMatch(id+uint(tmp)*n, l+1, l, matches) 404 } else if s[0] == '=' { 405 if s[1] == '"' { 406 var tmp int 407 if is_all_caps { 408 tmp = 105 409 } else { 410 tmp = 104 411 } 412 addMatch(id+uint(tmp)*n, l+2, l, matches) 413 } else if s[1] == '\'' { 414 var tmp int 415 if is_all_caps { 416 tmp = 116 417 } else { 418 tmp = 108 419 } 420 addMatch(id+uint(tmp)*n, l+2, l, matches) 421 } 422 } 423 } 424 } 425 } 426 427 /* Transforms with prefixes " " and "." */ 428 if max_length >= 5 && (data[0] == ' ' || data[0] == '.') { 429 var is_space bool = (data[0] == ' ') 430 var offset uint = uint(dict.buckets[hash(data[1:])]) 431 var end bool = offset == 0 432 for !end { 433 w := dict.dict_words[offset] 434 offset++ 435 var l uint = uint(w.len) & 0x1F 436 var n uint = uint(1) << dict.words.size_bits_by_length[l] 437 var id uint = uint(w.idx) 438 end = !(w.len&0x80 == 0) 439 w.len = byte(l) 440 if w.transform == 0 { 441 var s []byte 442 if !isMatch(dict.words, w, data[1:], max_length-1) { 443 continue 444 } 445 446 /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + "" and 447 "." + BROTLI_TRANSFORM_IDENTITY + "" */ 448 var tmp int 449 if is_space { 450 tmp = 6 451 } else { 452 tmp = 32 453 } 454 addMatch(id+uint(tmp)*n, l+1, l, matches) 455 456 has_found_match = true 457 if l+2 >= max_length { 458 continue 459 } 460 461 /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + <suffix> and 462 "." + BROTLI_TRANSFORM_IDENTITY + <suffix> 463 */ 464 s = data[l+1:] 465 466 if s[0] == ' ' { 467 var tmp int 468 if is_space { 469 tmp = 2 470 } else { 471 tmp = 77 472 } 473 addMatch(id+uint(tmp)*n, l+2, l, matches) 474 } else if s[0] == '(' { 475 var tmp int 476 if is_space { 477 tmp = 89 478 } else { 479 tmp = 67 480 } 481 addMatch(id+uint(tmp)*n, l+2, l, matches) 482 } else if is_space { 483 if s[0] == ',' { 484 addMatch(id+103*n, l+2, l, matches) 485 if s[1] == ' ' { 486 addMatch(id+33*n, l+3, l, matches) 487 } 488 } else if s[0] == '.' { 489 addMatch(id+71*n, l+2, l, matches) 490 if s[1] == ' ' { 491 addMatch(id+52*n, l+3, l, matches) 492 } 493 } else if s[0] == '=' { 494 if s[1] == '"' { 495 addMatch(id+81*n, l+3, l, matches) 496 } else if s[1] == '\'' { 497 addMatch(id+98*n, l+3, l, matches) 498 } 499 } 500 } 501 } else if is_space { 502 var is_all_caps bool = (w.transform != transformUppercaseFirst) 503 /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and 504 is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL) 505 transform. */ 506 507 var s []byte 508 if !isMatch(dict.words, w, data[1:], max_length-1) { 509 continue 510 } 511 512 /* Transforms " " + kUppercase{First,All} + "" */ 513 var tmp int 514 if is_all_caps { 515 tmp = 85 516 } else { 517 tmp = 30 518 } 519 addMatch(id+uint(tmp)*n, l+1, l, matches) 520 521 has_found_match = true 522 if l+2 >= max_length { 523 continue 524 } 525 526 /* Transforms " " + kUppercase{First,All} + <suffix> */ 527 s = data[l+1:] 528 529 if s[0] == ' ' { 530 var tmp int 531 if is_all_caps { 532 tmp = 83 533 } else { 534 tmp = 15 535 } 536 addMatch(id+uint(tmp)*n, l+2, l, matches) 537 } else if s[0] == ',' { 538 if !is_all_caps { 539 addMatch(id+109*n, l+2, l, matches) 540 } 541 542 if s[1] == ' ' { 543 var tmp int 544 if is_all_caps { 545 tmp = 111 546 } else { 547 tmp = 65 548 } 549 addMatch(id+uint(tmp)*n, l+3, l, matches) 550 } 551 } else if s[0] == '.' { 552 var tmp int 553 if is_all_caps { 554 tmp = 115 555 } else { 556 tmp = 96 557 } 558 addMatch(id+uint(tmp)*n, l+2, l, matches) 559 if s[1] == ' ' { 560 var tmp int 561 if is_all_caps { 562 tmp = 117 563 } else { 564 tmp = 91 565 } 566 addMatch(id+uint(tmp)*n, l+3, l, matches) 567 } 568 } else if s[0] == '=' { 569 if s[1] == '"' { 570 var tmp int 571 if is_all_caps { 572 tmp = 110 573 } else { 574 tmp = 118 575 } 576 addMatch(id+uint(tmp)*n, l+3, l, matches) 577 } else if s[1] == '\'' { 578 var tmp int 579 if is_all_caps { 580 tmp = 119 581 } else { 582 tmp = 120 583 } 584 addMatch(id+uint(tmp)*n, l+3, l, matches) 585 } 586 } 587 } 588 } 589 } 590 591 if max_length >= 6 { 592 /* Transforms with prefixes "e ", "s ", ", " and "\xC2\xA0" */ 593 if (data[1] == ' ' && (data[0] == 'e' || data[0] == 's' || data[0] == ',')) || (data[0] == 0xC2 && data[1] == 0xA0) { 594 var offset uint = uint(dict.buckets[hash(data[2:])]) 595 var end bool = offset == 0 596 for !end { 597 w := dict.dict_words[offset] 598 offset++ 599 var l uint = uint(w.len) & 0x1F 600 var n uint = uint(1) << dict.words.size_bits_by_length[l] 601 var id uint = uint(w.idx) 602 end = !(w.len&0x80 == 0) 603 w.len = byte(l) 604 if w.transform == 0 && isMatch(dict.words, w, data[2:], max_length-2) { 605 if data[0] == 0xC2 { 606 addMatch(id+102*n, l+2, l, matches) 607 has_found_match = true 608 } else if l+2 < max_length && data[l+2] == ' ' { 609 var t uint = 13 610 if data[0] == 'e' { 611 t = 18 612 } else if data[0] == 's' { 613 t = 7 614 } 615 addMatch(id+t*n, l+3, l, matches) 616 has_found_match = true 617 } 618 } 619 } 620 } 621 } 622 623 if max_length >= 9 { 624 /* Transforms with prefixes " the " and ".com/" */ 625 if (data[0] == ' ' && data[1] == 't' && data[2] == 'h' && data[3] == 'e' && data[4] == ' ') || (data[0] == '.' && data[1] == 'c' && data[2] == 'o' && data[3] == 'm' && data[4] == '/') { 626 var offset uint = uint(dict.buckets[hash(data[5:])]) 627 var end bool = offset == 0 628 for !end { 629 w := dict.dict_words[offset] 630 offset++ 631 var l uint = uint(w.len) & 0x1F 632 var n uint = uint(1) << dict.words.size_bits_by_length[l] 633 var id uint = uint(w.idx) 634 end = !(w.len&0x80 == 0) 635 w.len = byte(l) 636 if w.transform == 0 && isMatch(dict.words, w, data[5:], max_length-5) { 637 var tmp int 638 if data[0] == ' ' { 639 tmp = 41 640 } else { 641 tmp = 72 642 } 643 addMatch(id+uint(tmp)*n, l+5, l, matches) 644 has_found_match = true 645 if l+5 < max_length { 646 var s []byte = data[l+5:] 647 if data[0] == ' ' { 648 if l+8 < max_length && s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ' { 649 addMatch(id+62*n, l+9, l, matches) 650 if l+12 < max_length && s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ' { 651 addMatch(id+73*n, l+13, l, matches) 652 } 653 } 654 } 655 } 656 } 657 } 658 } 659 } 660 661 return has_found_match 662 }