// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// "One-pass" regexp execution.
// Some regexps can be analyzed to determine that they never need
// backtracking: they are guaranteed to run in one pass over the string
// without bothering to save all the usual NFA state.
// Detect those and execute them more quickly.

// A onePassProg is a compiled one-pass regular expression program.
// It is the same as syntax.Prog except for the use of onePassInst.
type onePassProg struct {
	Inst   []onePassInst
	Start  int // index of start instruction
	NumCap int // number of InstCapture insts in re
}

// A onePassInst is a single instruction in a one-pass regular expression program.
// It is the same as syntax.Inst except for the new 'Next' field.
type onePassInst struct {
	syntax.Inst
	Next []uint32
}

// onePassPrefix returns a literal string that all matches for the
// regexp must start with. It skips over the mandatory EmptyBeginText
// instruction at p.Start and any InstNop instructions that follow it.
//
// complete is true if the prefix is the entire match, that is, if the
// prefix is followed directly by an EmptyEndText instruction whose Out
// is InstMatch. pc is the index of the first instruction after the
// prefix. When no prefix can be extracted at all, the result is
// ("", startIsMatch, p.Start), where startIsMatch reports whether the
// instruction reached at that point is InstMatch.
func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
	i := &p.Inst[p.Start]
	if i.Op != syntax.InstEmptyWidth || syntax.EmptyOp(i.Arg)&syntax.EmptyBeginText == 0 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}
	pc = i.Out
	i = &p.Inst[pc]
	for i.Op == syntax.InstNop {
		pc = i.Out
		i = &p.Inst[pc]
	}
	// Avoid allocation of buffer if prefix is empty.
	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}

	// Have prefix; gather characters. Stop at case-folded runes and at
	// U+FFFD: a U+FFFD instruction also matches input that merely decodes
	// to U+FFFD, so it cannot be compared as literal bytes.
	var buf bytes.Buffer
	for iop(i) == syntax.InstRune && len(i.Rune) == 1 &&
		syntax.Flags(i.Arg)&syntax.FoldCase == 0 &&
		i.Rune[0] != unicode.ReplacementChar {
		buf.WriteRune(i.Rune[0])
		pc, i = i.Out, &p.Inst[i.Out]
	}
	// The prefix is the whole match only if nothing but the end-of-text
	// assertion stands between it and InstMatch.
	if i.Op == syntax.InstEmptyWidth &&
		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
		p.Inst[i.Out].Op == syntax.InstMatch {
		complete = true
	}
	return buf.String(), complete, pc
}

// onePassNext selects the next actionable state of the prog, based on the input character.
// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
// One of the alternates may ultimately lead without input to end of line. If the instruction
// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
//
// It returns i.Next[k] when r falls in the k'th rune range of i, i.Out
// when r matches no range and i is an InstAltMatch, and 0 otherwise.
func onePassNext(i *onePassInst, r rune) uint32 {
	next := i.MatchRunePos(r)
	if next >= 0 {
		return i.Next[next]
	}
	if i.Op == syntax.InstAltMatch {
		return i.Out
	}
	return 0
}

// iop returns i.Op with the specialized rune opcodes (InstRune1,
// InstRuneAny, InstRuneAnyNotNL) folded into the generic InstRune.
func iop(i *syntax.Inst) syntax.InstOp {
	op := i.Op
	switch op {
	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		op = syntax.InstRune
	}
	return op
}

// Sparse Array implementation is used as a queueOnePass.
// dense lists the members in insertion order and sparse[u] records the
// position of u inside dense, so membership tests and inserts need no
// scanning. size counts the members; nextIndex is the read cursor used
// by next and empty.
type queueOnePass struct {
	sparse          []uint32
	dense           []uint32
	size, nextIndex uint32
}

// empty reports whether the read cursor has passed the last member.
func (q *queueOnePass) empty() bool {
	return !(q.nextIndex < q.size)
}

// next returns the member under the read cursor and advances the cursor.
// It does not check empty first; that is left to the caller.
func (q *queueOnePass) next() (n uint32) {
	n = q.dense[q.nextIndex]
	q.nextIndex++
	return n
}

// clear forgets all members and rewinds the read cursor.
func (q *queueOnePass) clear() {
	q.size, q.nextIndex = 0, 0
}

// reset rewinds the read cursor but keeps the members.
func (q *queueOnePass) reset() { q.nextIndex = 0 }

// contains reports whether u is a member. Values beyond the capacity
// given to newQueue are never members.
func (q *queueOnePass) contains(u uint32) bool {
	if u < uint32(len(q.sparse)) {
		slot := q.sparse[u]
		// sparse may hold stale data, so cross-check against dense.
		return slot < q.size && q.dense[slot] == u
	}
	return false
}

// insert adds u unless it is already a member.
func (q *queueOnePass) insert(u uint32) {
	if q.contains(u) {
		return
	}
	q.insertNew(u)
}

// insertNew appends u without a membership test. Values beyond the
// capacity given to newQueue are silently dropped.
func (q *queueOnePass) insertNew(u uint32) {
	if u < uint32(len(q.sparse)) {
		pos := q.size
		q.sparse[u] = pos
		q.dense[pos] = u
		q.size = pos + 1
	}
}

// newQueue returns an empty queue able to hold the values 0..size-1.
func newQueue(size int) (q *queueOnePass) {
	q = new(queueOnePass)
	q.sparse = make([]uint32, size)
	q.dense = make([]uint32, size)
	return q
}

// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
// NextIp array with the single element mergeFailed is returned.
// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
const mergeFailed = uint32(0xffffffff) // sentinel stored in noNext to signal a failed merge

var (
	noRune = []rune{}
	noNext = []uint32{mergeFailed}
)

// mergeRuneSets merges two rune sets, each a sorted list of [lo, hi]
// pairs, into one sorted list and returns it together with a parallel
// table holding, for every merged pair, leftPC or rightPC according to
// which input the pair came from.
//
// If a pair starts at or before the end of the previously emitted pair,
// the merge is abandoned and (noRune, noNext) is returned; callers
// detect this by checking whether the first element of the table is
// mergeFailed. It panics if either input has odd length.
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
	leftLen := len(*leftRunes)
	rightLen := len(*rightRunes)
	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
		panic("mergeRuneSets odd length []rune")
	}
	var lx, rx int
	// The output can never hold more than every pair of both inputs.
	merged := make([]rune, 0, leftLen+rightLen)
	next := make([]uint32, 0, (leftLen+rightLen)/2)

	// ix is the index in merged of the upper bound of the last emitted
	// pair, or -1 before anything has been emitted.
	ix := -1
	// extend appends the pair at (*newArray)[*newLow] and advances
	// *newLow, or reports false if the pair overlaps the last one emitted.
	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
			return false
		}
		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
		*newLow += 2
		ix += 2
		next = append(next, pc)
		return true
	}

	for lx < leftLen || rx < rightLen {
		var ok bool
		switch {
		case rx >= rightLen:
			ok = extend(&lx, leftRunes, leftPC)
		case lx >= leftLen:
			ok = extend(&rx, rightRunes, rightPC)
		case (*rightRunes)[rx] < (*leftRunes)[lx]:
			ok = extend(&rx, rightRunes, rightPC)
		default:
			ok = extend(&lx, leftRunes, leftPC)
		}
		if !ok {
			return noRune, noNext
		}
	}
	return merged, next
}

// cleanupOnePass drops working memory, and restores certain shortcut instructions.
206 func cleanupOnePass(prog *onePassProg, original *syntax.Prog) { 207 for ix, instOriginal := range original.Inst { 208 switch instOriginal.Op { 209 case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune: 210 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail: 211 prog.Inst[ix].Next = nil 212 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 213 prog.Inst[ix].Next = nil 214 prog.Inst[ix] = onePassInst{Inst: instOriginal} 215 } 216 } 217 } 218 219 // onePassCopy creates a copy of the original Prog, as we'll be modifying it 220 func onePassCopy(prog *syntax.Prog) *onePassProg { 221 p := &onePassProg{ 222 Start: prog.Start, 223 NumCap: prog.NumCap, 224 } 225 for _, inst := range prog.Inst { 226 p.Inst = append(p.Inst, onePassInst{Inst: inst}) 227 } 228 229 // rewrites one or more common Prog constructs that enable some otherwise 230 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at 231 // ip A, that points to ips B & C. 
232 // A:BC + B:DA => A:BC + B:CD 233 // A:BC + B:DC => A:DC + B:DC 234 for pc := range p.Inst { 235 switch p.Inst[pc].Op { 236 default: 237 continue 238 case syntax.InstAlt, syntax.InstAltMatch: 239 // A:Bx + B:Ay 240 p_A_Other := &p.Inst[pc].Out 241 p_A_Alt := &p.Inst[pc].Arg 242 // make sure a target is another Alt 243 instAlt := p.Inst[*p_A_Alt] 244 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 245 p_A_Alt, p_A_Other = p_A_Other, p_A_Alt 246 instAlt = p.Inst[*p_A_Alt] 247 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 248 continue 249 } 250 } 251 instOther := p.Inst[*p_A_Other] 252 // Analyzing both legs pointing to Alts is for another day 253 if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch { 254 // too complicated 255 continue 256 } 257 // simple empty transition loop 258 // A:BC + B:DA => A:BC + B:DC 259 p_B_Alt := &p.Inst[*p_A_Alt].Out 260 p_B_Other := &p.Inst[*p_A_Alt].Arg 261 patch := false 262 if instAlt.Out == uint32(pc) { 263 patch = true 264 } else if instAlt.Arg == uint32(pc) { 265 patch = true 266 p_B_Alt, p_B_Other = p_B_Other, p_B_Alt 267 } 268 if patch { 269 *p_B_Alt = *p_A_Other 270 } 271 272 // empty transition to common target 273 // A:BC + B:DC => A:DC + B:DC 274 if *p_A_Other == *p_B_Alt { 275 *p_A_Alt = *p_B_Other 276 } 277 } 278 } 279 return p 280 } 281 282 // runeSlice exists to permit sorting the case-folded rune sets. 283 type runeSlice []rune 284 285 func (p runeSlice) Len() int { return len(p) } 286 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } 287 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 288 289 // Sort is a convenience method. 290 func (p runeSlice) Sort() { 291 sort.Sort(p) 292 } 293 294 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} 295 var anyRune = []rune{0, unicode.MaxRune} 296 297 // makeOnePass creates a onepass Prog, if possible. 
It is possible if at any alt, 298 // the match engine can always tell which branch to take. The routine may modify 299 // p if it is turned into a onepass Prog. If it isn't possible for this to be a 300 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive 301 // to the size of the Prog. 302 func makeOnePass(p *onePassProg) *onePassProg { 303 // If the machine is very long, it's not worth the time to check if we can use one pass. 304 if len(p.Inst) >= 1000 { 305 return notOnePass 306 } 307 308 var ( 309 instQueue = newQueue(len(p.Inst)) 310 visitQueue = newQueue(len(p.Inst)) 311 build func(uint32, *queueOnePass) 312 check func(uint32, map[uint32]bool) bool 313 onePassRunes = make([][]rune, len(p.Inst)) 314 ) 315 build = func(pc uint32, q *queueOnePass) { 316 if q.contains(pc) { 317 return 318 } 319 inst := p.Inst[pc] 320 switch inst.Op { 321 case syntax.InstAlt, syntax.InstAltMatch: 322 q.insert(inst.Out) 323 build(inst.Out, q) 324 q.insert(inst.Arg) 325 case syntax.InstMatch, syntax.InstFail: 326 default: 327 q.insert(inst.Out) 328 } 329 } 330 331 // check that paths from Alt instructions are unambiguous, and rebuild the new 332 // program as a onepass program 333 check = func(pc uint32, m map[uint32]bool) (ok bool) { 334 ok = true 335 inst := &p.Inst[pc] 336 if visitQueue.contains(pc) { 337 return 338 } 339 visitQueue.insert(pc) 340 switch inst.Op { 341 case syntax.InstAlt, syntax.InstAltMatch: 342 ok = check(inst.Out, m) && check(inst.Arg, m) 343 // check no-input paths to InstMatch 344 matchOut := m[inst.Out] 345 matchArg := m[inst.Arg] 346 if matchOut && matchArg { 347 ok = false 348 break 349 } 350 // Match on empty goes in inst.Out 351 if matchArg { 352 inst.Out, inst.Arg = inst.Arg, inst.Out 353 matchOut, matchArg = matchArg, matchOut 354 } 355 if matchOut { 356 m[pc] = true 357 inst.Op = syntax.InstAltMatch 358 } 359 360 // build a dispatch operator from the two legs of the alt. 
361 onePassRunes[pc], inst.Next = mergeRuneSets( 362 &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) 363 if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { 364 ok = false 365 break 366 } 367 case syntax.InstCapture, syntax.InstNop: 368 ok = check(inst.Out, m) 369 m[pc] = m[inst.Out] 370 // pass matching runes back through these no-ops. 371 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 372 inst.Next = []uint32{} 373 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 374 inst.Next = append(inst.Next, inst.Out) 375 } 376 case syntax.InstEmptyWidth: 377 ok = check(inst.Out, m) 378 m[pc] = m[inst.Out] 379 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 380 inst.Next = []uint32{} 381 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 382 inst.Next = append(inst.Next, inst.Out) 383 } 384 case syntax.InstMatch, syntax.InstFail: 385 m[pc] = inst.Op == syntax.InstMatch 386 break 387 case syntax.InstRune: 388 ok = check(inst.Out, m) 389 m[pc] = false 390 if len(inst.Next) > 0 { 391 break 392 } 393 if len(inst.Rune) == 0 { 394 onePassRunes[pc] = []rune{} 395 inst.Next = []uint32{inst.Out} 396 break 397 } 398 runes := make([]rune, 0) 399 if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 400 r0 := inst.Rune[0] 401 runes = append(runes, r0, r0) 402 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 403 runes = append(runes, r1, r1) 404 } 405 sort.Sort(runeSlice(runes)) 406 } else { 407 runes = append(runes, inst.Rune...) 
408 } 409 onePassRunes[pc] = runes 410 inst.Next = []uint32{} 411 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 412 inst.Next = append(inst.Next, inst.Out) 413 } 414 inst.Op = syntax.InstRune 415 case syntax.InstRune1: 416 ok = check(inst.Out, m) 417 m[pc] = false 418 if len(inst.Next) > 0 { 419 break 420 } 421 runes := []rune{} 422 // expand case-folded runes 423 if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 424 r0 := inst.Rune[0] 425 runes = append(runes, r0, r0) 426 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 427 runes = append(runes, r1, r1) 428 } 429 sort.Sort(runeSlice(runes)) 430 } else { 431 runes = append(runes, inst.Rune[0], inst.Rune[0]) 432 } 433 onePassRunes[pc] = runes 434 inst.Next = []uint32{} 435 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 436 inst.Next = append(inst.Next, inst.Out) 437 } 438 inst.Op = syntax.InstRune 439 case syntax.InstRuneAny: 440 ok = check(inst.Out, m) 441 m[pc] = false 442 if len(inst.Next) > 0 { 443 break 444 } 445 onePassRunes[pc] = append([]rune{}, anyRune...) 446 inst.Next = []uint32{inst.Out} 447 case syntax.InstRuneAnyNotNL: 448 ok = check(inst.Out, m) 449 m[pc] = false 450 if len(inst.Next) > 0 { 451 break 452 } 453 onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) 
454 inst.Next = []uint32{} 455 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 456 inst.Next = append(inst.Next, inst.Out) 457 } 458 } 459 return 460 } 461 462 instQueue.clear() 463 instQueue.insert(uint32(p.Start)) 464 m := make(map[uint32]bool, len(p.Inst)) 465 for !instQueue.empty() { 466 pc := instQueue.next() 467 inst := p.Inst[pc] 468 visitQueue.clear() 469 if !check(uint32(pc), m) { 470 p = notOnePass 471 break 472 } 473 switch inst.Op { 474 case syntax.InstAlt, syntax.InstAltMatch: 475 instQueue.insert(inst.Out) 476 instQueue.insert(inst.Arg) 477 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop: 478 instQueue.insert(inst.Out) 479 case syntax.InstMatch: 480 case syntax.InstFail: 481 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 482 default: 483 } 484 } 485 if p != notOnePass { 486 for i := range p.Inst { 487 p.Inst[i].Rune = onePassRunes[i] 488 } 489 } 490 return p 491 } 492 493 // walk visits each Inst in the prog once, and applies the argument 494 // function(ip, next), in pre-order. 
495 func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) { 496 var walk1 func(uint32) 497 progQueue := newQueue(len(prog.Inst)) 498 walk1 = func(ip uint32) { 499 if progQueue.contains(ip) { 500 return 501 } 502 progQueue.insert(ip) 503 inst := prog.Inst[ip] 504 switch inst.Op { 505 case syntax.InstAlt, syntax.InstAltMatch: 506 for _, f := range funcs { 507 f(ip, inst.Out) 508 f(ip, inst.Arg) 509 } 510 walk1(inst.Out) 511 walk1(inst.Arg) 512 default: 513 for _, f := range funcs { 514 f(ip, inst.Out) 515 } 516 walk1(inst.Out) 517 } 518 } 519 walk1(uint32(prog.Start)) 520 } 521 522 // find returns the Insts that match the argument predicate function 523 func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) { 524 matches = []uint32{} 525 526 for ip := range prog.Inst { 527 if f(prog, ip) { 528 matches = append(matches, uint32(ip)) 529 } 530 } 531 return 532 } 533 534 var notOnePass *onePassProg = nil 535 536 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog 537 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the 538 // Prog cannot be converted. For a one pass prog, the fundamental condition that must 539 // be true is: at any InstAlt, there must be no ambiguity about what branch to take. 
540 func compileOnePass(prog *syntax.Prog) (p *onePassProg) { 541 if prog.Start == 0 { 542 return notOnePass 543 } 544 // onepass regexp is anchored 545 if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || 546 syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { 547 return notOnePass 548 } 549 // every instruction leading to InstMatch must be EmptyEndText 550 for _, inst := range prog.Inst { 551 opOut := prog.Inst[inst.Out].Op 552 switch inst.Op { 553 default: 554 if opOut == syntax.InstMatch { 555 return notOnePass 556 } 557 case syntax.InstAlt, syntax.InstAltMatch: 558 if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { 559 return notOnePass 560 } 561 case syntax.InstEmptyWidth: 562 if opOut == syntax.InstMatch { 563 if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { 564 continue 565 } 566 return notOnePass 567 } 568 } 569 } 570 // Creates a slightly optimized copy of the original Prog 571 // that cleans up some Prog idioms that block valid onepass programs 572 p = onePassCopy(prog) 573 574 // checkAmbiguity on InstAlts, build onepass Prog if possible 575 p = makeOnePass(p) 576 577 if p != notOnePass { 578 cleanupOnePass(p, prog) 579 } 580 return p 581 }