// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// "One-pass" regexp execution.
// Some regexps can be analyzed to determine that they never need
// backtracking: they are guaranteed to run in one pass over the string
// without bothering to save all the usual NFA state.
// Detect those and execute them more quickly.

// A onePassProg is a compiled one-pass regular expression program.
// It is the same as syntax.Prog except for the use of onePassInst.
type onePassProg struct {
	Inst   []onePassInst
	Start  int // index of start instruction
	NumCap int // number of InstCapture insts in re
}

// A onePassInst is a single instruction in a one-pass regular expression program.
// It is the same as syntax.Inst except for the new 'Next' field.
type onePassInst struct {
	syntax.Inst
	Next []uint32
}

// onePassPrefix returns a literal string that all matches for the
// regexp must start with. Complete is true if the prefix
// is the entire match. Pc is the index of the instruction that follows
// the prefix; when no prefix can be extracted it is p.Start.
// onePassPrefix skips over the mandatory EmptyBeginText.
func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
	i := &p.Inst[p.Start]
	if i.Op != syntax.InstEmptyWidth || syntax.EmptyOp(i.Arg)&syntax.EmptyBeginText == 0 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}
	pc = i.Out
	i = &p.Inst[pc]
	for i.Op == syntax.InstNop {
		pc = i.Out
		i = &p.Inst[pc]
	}
	// Avoid allocation of buffer if prefix is empty.
	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}

	// Have prefix; gather characters. Case-folded runes end the prefix
	// because they do not match a single literal byte sequence.
	var buf bytes.Buffer
	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
		buf.WriteRune(i.Rune[0])
		pc, i = i.Out, &p.Inst[i.Out]
	}

	// The prefix is the entire match only if the very next instruction is
	// an end-of-text assertion that leads straight to InstMatch. A
	// begin-of-text assertion at this point (as in `^0^0$`) means more of
	// the program still has to run, so it must not count as complete.
	if i.Op == syntax.InstEmptyWidth &&
		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
		p.Inst[i.Out].Op == syntax.InstMatch {
		complete = true
	}
	return buf.String(), complete, pc
}

// onePassNext selects the next actionable state of the prog, based on the input character.
// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
// One of the alternates may ultimately lead without input to end of line. If the instruction
// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
// It returns 0 when r selects no branch and i is not an InstAltMatch.
func onePassNext(i *onePassInst, r rune) uint32 {
	if next := i.MatchRunePos(r); next >= 0 {
		return i.Next[next]
	}
	if i.Op == syntax.InstAltMatch {
		return i.Out
	}
	return 0
}

// iop returns i.Op with the specialized rune opcodes (InstRune1,
// InstRuneAny, InstRuneAnyNotNL) folded into InstRune.
func iop(i *syntax.Inst) syntax.InstOp {
	op := i.Op
	switch op {
	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		op = syntax.InstRune
	}
	return op
}

// Sparse Array implementation is used as a queueOnePass.
// sparse[u] records the slot of u inside dense, and dense[:size] lists the
// inserted values in insertion order. nextIndex is the read cursor that
// next advances and empty inspects.
type queueOnePass struct {
	sparse          []uint32
	dense           []uint32
	size, nextIndex uint32
}

// empty reports whether every inserted value has already been read.
func (q *queueOnePass) empty() bool {
	return q.size <= q.nextIndex
}

// next returns the value under the read cursor and advances the cursor.
func (q *queueOnePass) next() (n uint32) {
	n = q.dense[q.nextIndex]
	q.nextIndex++
	return n
}

// clear forgets all inserted values and rewinds the read cursor.
func (q *queueOnePass) clear() {
	q.size, q.nextIndex = 0, 0
}

// reset rewinds the read cursor without forgetting the inserted values.
func (q *queueOnePass) reset() {
	q.nextIndex = 0
}

// contains reports whether u is currently in the queue. Values that do
// not fit in the sparse table are never contained.
func (q *queueOnePass) contains(u uint32) bool {
	if u >= uint32(len(q.sparse)) {
		return false
	}
	slot := q.sparse[u]
	return slot < q.size && q.dense[slot] == u
}

// insert adds u unless it is already present.
func (q *queueOnePass) insert(u uint32) {
	if q.contains(u) {
		return
	}
	q.insertNew(u)
}

// insertNew appends u without checking for membership. Values that do not
// fit in the sparse table are silently dropped.
func (q *queueOnePass) insertNew(u uint32) {
	if u < uint32(len(q.sparse)) {
		q.sparse[u] = q.size
		q.dense[q.size] = u
		q.size++
	}
}

// newQueue returns an empty queue able to hold the values 0..size-1.
func newQueue(size int) (q *queueOnePass) {
	q = new(queueOnePass)
	q.sparse = make([]uint32, size)
	q.dense = make([]uint32, size)
	return q
}

// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
// NextIp array with the single element mergeFailed is returned.
// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
const mergeFailed = uint32(0xffffffff) // sentinel pc: the only element of noNext

var (
	noRune = []rune{}
	noNext = []uint32{mergeFailed}
)

// mergeRuneSets merges two sorted lists of [lo, hi] rune pairs into one
// sorted list and returns it together with a parallel slice holding, for
// each merged pair, the pc (leftPC or rightPC) of the input it came from.
// If a pair starts at or before the end of the previously merged pair, the
// sets intersect and (noRune, noNext) is returned; callers detect this by
// checking for mergeFailed in the first element of the second result.
// It panics if either input has odd length.
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
	leftLen := len(*leftRunes)
	rightLen := len(*rightRunes)
	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
		panic("mergeRuneSets odd length []rune")
	}

	var lx, rx int
	// The output sizes are known in advance: every input pair is copied once.
	merged := make([]rune, 0, leftLen+rightLen)
	next := make([]uint32, 0, (leftLen+rightLen)/2)

	// ix is the index in merged of the hi bound most recently appended,
	// or -1 while merged is still empty.
	ix := -1
	// extend appends the pair at *newLow of *newArray, tagged with pc, and
	// advances *newLow. It reports false if the pair overlaps the previous one.
	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
			return false
		}
		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
		*newLow += 2
		ix += 2
		next = append(next, pc)
		return true
	}

	for lx < leftLen || rx < rightLen {
		var ok bool
		switch {
		case rx >= rightLen:
			ok = extend(&lx, leftRunes, leftPC)
		case lx >= leftLen:
			ok = extend(&rx, rightRunes, rightPC)
		case (*rightRunes)[rx] < (*leftRunes)[lx]:
			ok = extend(&rx, rightRunes, rightPC)
		default:
			ok = extend(&lx, leftRunes, leftPC)
		}
		if !ok {
			return noRune, noNext
		}
	}
	return merged, next
}

// cleanupOnePass drops working memory, and restores certain shortcut instructions.
207 func cleanupOnePass(prog *onePassProg, original *syntax.Prog) { 208 for ix, instOriginal := range original.Inst { 209 switch instOriginal.Op { 210 case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune: 211 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail: 212 prog.Inst[ix].Next = nil 213 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 214 prog.Inst[ix].Next = nil 215 prog.Inst[ix] = onePassInst{Inst: instOriginal} 216 } 217 } 218 } 219 220 // onePassCopy creates a copy of the original Prog, as we'll be modifying it 221 func onePassCopy(prog *syntax.Prog) *onePassProg { 222 p := &onePassProg{ 223 Start: prog.Start, 224 NumCap: prog.NumCap, 225 } 226 for _, inst := range prog.Inst { 227 p.Inst = append(p.Inst, onePassInst{Inst: inst}) 228 } 229 230 // rewrites one or more common Prog constructs that enable some otherwise 231 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at 232 // ip A, that points to ips B & C. 
233 // A:BC + B:DA => A:BC + B:CD 234 // A:BC + B:DC => A:DC + B:DC 235 for pc := range p.Inst { 236 switch p.Inst[pc].Op { 237 default: 238 continue 239 case syntax.InstAlt, syntax.InstAltMatch: 240 // A:Bx + B:Ay 241 p_A_Other := &p.Inst[pc].Out 242 p_A_Alt := &p.Inst[pc].Arg 243 // make sure a target is another Alt 244 instAlt := p.Inst[*p_A_Alt] 245 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 246 p_A_Alt, p_A_Other = p_A_Other, p_A_Alt 247 instAlt = p.Inst[*p_A_Alt] 248 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 249 continue 250 } 251 } 252 instOther := p.Inst[*p_A_Other] 253 // Analyzing both legs pointing to Alts is for another day 254 if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch { 255 // too complicated 256 continue 257 } 258 // simple empty transition loop 259 // A:BC + B:DA => A:BC + B:DC 260 p_B_Alt := &p.Inst[*p_A_Alt].Out 261 p_B_Other := &p.Inst[*p_A_Alt].Arg 262 patch := false 263 if instAlt.Out == uint32(pc) { 264 patch = true 265 } else if instAlt.Arg == uint32(pc) { 266 patch = true 267 p_B_Alt, p_B_Other = p_B_Other, p_B_Alt 268 } 269 if patch { 270 *p_B_Alt = *p_A_Other 271 } 272 273 // empty transition to common target 274 // A:BC + B:DC => A:DC + B:DC 275 if *p_A_Other == *p_B_Alt { 276 *p_A_Alt = *p_B_Other 277 } 278 } 279 } 280 return p 281 } 282 283 // runeSlice exists to permit sorting the case-folded rune sets. 284 type runeSlice []rune 285 286 func (p runeSlice) Len() int { return len(p) } 287 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } 288 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 289 290 // Sort is a convenience method. 291 func (p runeSlice) Sort() { 292 sort.Sort(p) 293 } 294 295 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} 296 var anyRune = []rune{0, unicode.MaxRune} 297 298 // makeOnePass creates a onepass Prog, if possible. 
It is possible if at any alt, 299 // the match engine can always tell which branch to take. The routine may modify 300 // p if it is turned into a onepass Prog. If it isn't possible for this to be a 301 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive 302 // to the size of the Prog. 303 func makeOnePass(p *onePassProg) *onePassProg { 304 // If the machine is very long, it's not worth the time to check if we can use one pass. 305 if len(p.Inst) >= 1000 { 306 return notOnePass 307 } 308 309 var ( 310 instQueue = newQueue(len(p.Inst)) 311 visitQueue = newQueue(len(p.Inst)) 312 build func(uint32, *queueOnePass) 313 check func(uint32, map[uint32]bool) bool 314 onePassRunes = make([][]rune, len(p.Inst)) 315 ) 316 build = func(pc uint32, q *queueOnePass) { 317 if q.contains(pc) { 318 return 319 } 320 inst := p.Inst[pc] 321 switch inst.Op { 322 case syntax.InstAlt, syntax.InstAltMatch: 323 q.insert(inst.Out) 324 build(inst.Out, q) 325 q.insert(inst.Arg) 326 case syntax.InstMatch, syntax.InstFail: 327 default: 328 q.insert(inst.Out) 329 } 330 } 331 332 // check that paths from Alt instructions are unambiguous, and rebuild the new 333 // program as a onepass program 334 check = func(pc uint32, m map[uint32]bool) (ok bool) { 335 ok = true 336 inst := &p.Inst[pc] 337 if visitQueue.contains(pc) { 338 return 339 } 340 visitQueue.insert(pc) 341 switch inst.Op { 342 case syntax.InstAlt, syntax.InstAltMatch: 343 ok = check(inst.Out, m) && check(inst.Arg, m) 344 // check no-input paths to InstMatch 345 matchOut := m[inst.Out] 346 matchArg := m[inst.Arg] 347 if matchOut && matchArg { 348 ok = false 349 break 350 } 351 // Match on empty goes in inst.Out 352 if matchArg { 353 inst.Out, inst.Arg = inst.Arg, inst.Out 354 matchOut, matchArg = matchArg, matchOut 355 } 356 if matchOut { 357 m[pc] = true 358 inst.Op = syntax.InstAltMatch 359 } 360 361 // build a dispatch operator from the two legs of the alt. 
362 onePassRunes[pc], inst.Next = mergeRuneSets( 363 &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) 364 if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { 365 ok = false 366 break 367 } 368 case syntax.InstCapture, syntax.InstNop: 369 ok = check(inst.Out, m) 370 m[pc] = m[inst.Out] 371 // pass matching runes back through these no-ops. 372 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 373 inst.Next = []uint32{} 374 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 375 inst.Next = append(inst.Next, inst.Out) 376 } 377 case syntax.InstEmptyWidth: 378 ok = check(inst.Out, m) 379 m[pc] = m[inst.Out] 380 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 381 inst.Next = []uint32{} 382 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 383 inst.Next = append(inst.Next, inst.Out) 384 } 385 case syntax.InstMatch, syntax.InstFail: 386 m[pc] = inst.Op == syntax.InstMatch 387 break 388 case syntax.InstRune: 389 ok = check(inst.Out, m) 390 m[pc] = false 391 if len(inst.Next) > 0 { 392 break 393 } 394 if len(inst.Rune) == 0 { 395 onePassRunes[pc] = []rune{} 396 inst.Next = []uint32{inst.Out} 397 break 398 } 399 runes := make([]rune, 0) 400 if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 401 r0 := inst.Rune[0] 402 runes = append(runes, r0, r0) 403 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 404 runes = append(runes, r1, r1) 405 } 406 sort.Sort(runeSlice(runes)) 407 } else { 408 runes = append(runes, inst.Rune...) 
409 } 410 onePassRunes[pc] = runes 411 inst.Next = []uint32{} 412 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 413 inst.Next = append(inst.Next, inst.Out) 414 } 415 inst.Op = syntax.InstRune 416 case syntax.InstRune1: 417 ok = check(inst.Out, m) 418 m[pc] = false 419 if len(inst.Next) > 0 { 420 break 421 } 422 runes := []rune{} 423 // expand case-folded runes 424 if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 425 r0 := inst.Rune[0] 426 runes = append(runes, r0, r0) 427 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 428 runes = append(runes, r1, r1) 429 } 430 sort.Sort(runeSlice(runes)) 431 } else { 432 runes = append(runes, inst.Rune[0], inst.Rune[0]) 433 } 434 onePassRunes[pc] = runes 435 inst.Next = []uint32{} 436 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 437 inst.Next = append(inst.Next, inst.Out) 438 } 439 inst.Op = syntax.InstRune 440 case syntax.InstRuneAny: 441 ok = check(inst.Out, m) 442 m[pc] = false 443 if len(inst.Next) > 0 { 444 break 445 } 446 onePassRunes[pc] = append([]rune{}, anyRune...) 447 inst.Next = []uint32{inst.Out} 448 case syntax.InstRuneAnyNotNL: 449 ok = check(inst.Out, m) 450 m[pc] = false 451 if len(inst.Next) > 0 { 452 break 453 } 454 onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) 
455 inst.Next = []uint32{} 456 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 457 inst.Next = append(inst.Next, inst.Out) 458 } 459 } 460 return 461 } 462 463 instQueue.clear() 464 instQueue.insert(uint32(p.Start)) 465 m := make(map[uint32]bool, len(p.Inst)) 466 for !instQueue.empty() { 467 pc := instQueue.next() 468 inst := p.Inst[pc] 469 visitQueue.clear() 470 if !check(uint32(pc), m) { 471 p = notOnePass 472 break 473 } 474 switch inst.Op { 475 case syntax.InstAlt, syntax.InstAltMatch: 476 instQueue.insert(inst.Out) 477 instQueue.insert(inst.Arg) 478 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop: 479 instQueue.insert(inst.Out) 480 case syntax.InstMatch: 481 case syntax.InstFail: 482 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 483 default: 484 } 485 } 486 if p != notOnePass { 487 for i, _ := range p.Inst { 488 p.Inst[i].Rune = onePassRunes[i] 489 } 490 } 491 return p 492 } 493 494 // walk visits each Inst in the prog once, and applies the argument 495 // function(ip, next), in pre-order. 
496 func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) { 497 var walk1 func(uint32) 498 progQueue := newQueue(len(prog.Inst)) 499 walk1 = func(ip uint32) { 500 if progQueue.contains(ip) { 501 return 502 } 503 progQueue.insert(ip) 504 inst := prog.Inst[ip] 505 switch inst.Op { 506 case syntax.InstAlt, syntax.InstAltMatch: 507 for _, f := range funcs { 508 f(ip, inst.Out) 509 f(ip, inst.Arg) 510 } 511 walk1(inst.Out) 512 walk1(inst.Arg) 513 default: 514 for _, f := range funcs { 515 f(ip, inst.Out) 516 } 517 walk1(inst.Out) 518 } 519 } 520 walk1(uint32(prog.Start)) 521 } 522 523 // find returns the Insts that match the argument predicate function 524 func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) { 525 matches = []uint32{} 526 527 for ip := range prog.Inst { 528 if f(prog, ip) { 529 matches = append(matches, uint32(ip)) 530 } 531 } 532 return 533 } 534 535 var notOnePass *onePassProg = nil 536 537 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog 538 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the 539 // Prog cannot be converted. For a one pass prog, the fundamental condition that must 540 // be true is: at any InstAlt, there must be no ambiguity about what branch to take. 
541 func compileOnePass(prog *syntax.Prog) (p *onePassProg) { 542 if prog.Start == 0 { 543 return notOnePass 544 } 545 // onepass regexp is anchored 546 if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || 547 syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { 548 return notOnePass 549 } 550 // every instruction leading to InstMatch must be EmptyEndText 551 for _, inst := range prog.Inst { 552 opOut := prog.Inst[inst.Out].Op 553 switch inst.Op { 554 default: 555 if opOut == syntax.InstMatch { 556 return notOnePass 557 } 558 case syntax.InstAlt, syntax.InstAltMatch: 559 if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { 560 return notOnePass 561 } 562 case syntax.InstEmptyWidth: 563 if opOut == syntax.InstMatch { 564 if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { 565 continue 566 } 567 return notOnePass 568 } 569 } 570 } 571 // Creates a slightly optimized copy of the original Prog 572 // that cleans up some Prog idioms that block valid onepass programs 573 p = onePassCopy(prog) 574 575 // checkAmbiguity on InstAlts, build onepass Prog if possible 576 p = makeOnePass(p) 577 578 if p != notOnePass { 579 cleanupOnePass(p, prog) 580 } 581 return p 582 }