github.com/sbinet/go@v0.0.0-20160827155028-54d7de7dd62b/src/regexp/onepass.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package regexp 6 7 import ( 8 "bytes" 9 "regexp/syntax" 10 "sort" 11 "unicode" 12 ) 13 14 // "One-pass" regexp execution. 15 // Some regexps can be analyzed to determine that they never need 16 // backtracking: they are guaranteed to run in one pass over the string 17 // without bothering to save all the usual NFA state. 18 // Detect those and execute them more quickly. 19 20 // A onePassProg is a compiled one-pass regular expression program. 21 // It is the same as syntax.Prog except for the use of onePassInst. 22 type onePassProg struct { 23 Inst []onePassInst 24 Start int // index of start instruction 25 NumCap int // number of InstCapture insts in re 26 } 27 28 // A onePassInst is a single instruction in a one-pass regular expression program. 29 // It is the same as syntax.Inst except for the new 'Next' field. 30 type onePassInst struct { 31 syntax.Inst 32 Next []uint32 33 } 34 35 // OnePassPrefix returns a literal string that all matches for the 36 // regexp must start with. Complete is true if the prefix 37 // is the entire match. Pc is the index of the last rune instruction 38 // in the string. The OnePassPrefix skips over the mandatory 39 // EmptyBeginText 40 func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { 41 i := &p.Inst[p.Start] 42 if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { 43 return "", i.Op == syntax.InstMatch, uint32(p.Start) 44 } 45 pc = i.Out 46 i = &p.Inst[pc] 47 for i.Op == syntax.InstNop { 48 pc = i.Out 49 i = &p.Inst[pc] 50 } 51 // Avoid allocation of buffer if prefix is empty. 52 if iop(i) != syntax.InstRune || len(i.Rune) != 1 { 53 return "", i.Op == syntax.InstMatch, uint32(p.Start) 54 } 55 56 // Have prefix; gather characters. 57 var buf bytes.Buffer 58 for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 { 59 buf.WriteRune(i.Rune[0]) 60 pc, i = i.Out, &p.Inst[i.Out] 61 } 62 if i.Op == syntax.InstEmptyWidth && 63 syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 && 64 p.Inst[i.Out].Op == syntax.InstMatch { 65 complete = true 66 } 67 return buf.String(), complete, pc 68 } 69 70 // OnePassNext selects the next actionable state of the prog, based on the input character. 71 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine. 72 // One of the alternates may ultimately lead without input to end of line. If the instruction 73 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next. 74 func onePassNext(i *onePassInst, r rune) uint32 { 75 next := i.MatchRunePos(r) 76 if next >= 0 { 77 return i.Next[next] 78 } 79 if i.Op == syntax.InstAltMatch { 80 return i.Out 81 } 82 return 0 83 } 84 85 func iop(i *syntax.Inst) syntax.InstOp { 86 op := i.Op 87 switch op { 88 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 89 op = syntax.InstRune 90 } 91 return op 92 } 93 94 // Sparse Array implementation is used as a queueOnePass. 95 type queueOnePass struct { 96 sparse []uint32 97 dense []uint32 98 size, nextIndex uint32 99 } 100 101 func (q *queueOnePass) empty() bool { 102 return q.nextIndex >= q.size 103 } 104 105 func (q *queueOnePass) next() (n uint32) { 106 n = q.dense[q.nextIndex] 107 q.nextIndex++ 108 return 109 } 110 111 func (q *queueOnePass) clear() { 112 q.size = 0 113 q.nextIndex = 0 114 } 115 116 func (q *queueOnePass) contains(u uint32) bool { 117 if u >= uint32(len(q.sparse)) { 118 return false 119 } 120 return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u 121 } 122 123 func (q *queueOnePass) insert(u uint32) { 124 if !q.contains(u) { 125 q.insertNew(u) 126 } 127 } 128 129 func (q *queueOnePass) insertNew(u uint32) { 130 if u >= uint32(len(q.sparse)) { 131 return 132 } 133 q.sparse[u] = q.size 134 q.dense[q.size] = u 135 q.size++ 136 } 137 138 func newQueue(size int) (q *queueOnePass) { 139 return &queueOnePass{ 140 sparse: make([]uint32, size), 141 dense: make([]uint32, size), 142 } 143 } 144 145 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result, 146 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index 147 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a 148 // NextIp array with the single element mergeFailed is returned. 149 // The code assumes that both inputs contain ordered and non-intersecting rune pairs. 150 const mergeFailed = uint32(0xffffffff) 151 152 var ( 153 noRune = []rune{} 154 noNext = []uint32{mergeFailed} 155 ) 156 157 func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) { 158 leftLen := len(*leftRunes) 159 rightLen := len(*rightRunes) 160 if leftLen&0x1 != 0 || rightLen&0x1 != 0 { 161 panic("mergeRuneSets odd length []rune") 162 } 163 var ( 164 lx, rx int 165 ) 166 merged := make([]rune, 0) 167 next := make([]uint32, 0) 168 ok := true 169 defer func() { 170 if !ok { 171 merged = nil 172 next = nil 173 } 174 }() 175 176 ix := -1 177 extend := func(newLow *int, newArray *[]rune, pc uint32) bool { 178 if ix > 0 && (*newArray)[*newLow] <= merged[ix] { 179 return false 180 } 181 merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1]) 182 *newLow += 2 183 ix += 2 184 next = append(next, pc) 185 return true 186 } 187 188 for lx < leftLen || rx < rightLen { 189 switch { 190 case rx >= rightLen: 191 ok = extend(&lx, leftRunes, leftPC) 192 case lx >= leftLen: 193 ok = extend(&rx, rightRunes, rightPC) 194 case (*rightRunes)[rx] < (*leftRunes)[lx]: 195 ok = extend(&rx, rightRunes, rightPC) 196 default: 197 ok = extend(&lx, leftRunes, leftPC) 198 } 199 if !ok { 200 return noRune, noNext 201 } 202 } 203 return merged, next 204 } 205 206 // cleanupOnePass drops working memory, and restores certain shortcut instructions. 207 func cleanupOnePass(prog *onePassProg, original *syntax.Prog) { 208 for ix, instOriginal := range original.Inst { 209 switch instOriginal.Op { 210 case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune: 211 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail: 212 prog.Inst[ix].Next = nil 213 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 214 prog.Inst[ix].Next = nil 215 prog.Inst[ix] = onePassInst{Inst: instOriginal} 216 } 217 } 218 } 219 220 // onePassCopy creates a copy of the original Prog, as we'll be modifying it 221 func onePassCopy(prog *syntax.Prog) *onePassProg { 222 p := &onePassProg{ 223 Start: prog.Start, 224 NumCap: prog.NumCap, 225 } 226 for _, inst := range prog.Inst { 227 p.Inst = append(p.Inst, onePassInst{Inst: inst}) 228 } 229 230 // rewrites one or more common Prog constructs that enable some otherwise 231 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at 232 // ip A, that points to ips B & C. 233 // A:BC + B:DA => A:BC + B:CD 234 // A:BC + B:DC => A:DC + B:DC 235 for pc := range p.Inst { 236 switch p.Inst[pc].Op { 237 default: 238 continue 239 case syntax.InstAlt, syntax.InstAltMatch: 240 // A:Bx + B:Ay 241 p_A_Other := &p.Inst[pc].Out 242 p_A_Alt := &p.Inst[pc].Arg 243 // make sure a target is another Alt 244 instAlt := p.Inst[*p_A_Alt] 245 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 246 p_A_Alt, p_A_Other = p_A_Other, p_A_Alt 247 instAlt = p.Inst[*p_A_Alt] 248 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 249 continue 250 } 251 } 252 instOther := p.Inst[*p_A_Other] 253 // Analyzing both legs pointing to Alts is for another day 254 if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch { 255 // too complicated 256 continue 257 } 258 // simple empty transition loop 259 // A:BC + B:DA => A:BC + B:DC 260 p_B_Alt := &p.Inst[*p_A_Alt].Out 261 p_B_Other := &p.Inst[*p_A_Alt].Arg 262 patch := false 263 if instAlt.Out == uint32(pc) { 264 patch = true 265 } else if instAlt.Arg == uint32(pc) { 266 patch = true 267 p_B_Alt, p_B_Other = p_B_Other, p_B_Alt 268 } 269 if patch { 270 *p_B_Alt = *p_A_Other 271 } 272 273 // empty transition to common target 274 // A:BC + B:DC => A:DC + B:DC 275 if *p_A_Other == *p_B_Alt { 276 *p_A_Alt = *p_B_Other 277 } 278 } 279 } 280 return p 281 } 282 283 // runeSlice exists to permit sorting the case-folded rune sets. 284 type runeSlice []rune 285 286 func (p runeSlice) Len() int { return len(p) } 287 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } 288 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 289 290 // Sort is a convenience method. 291 func (p runeSlice) Sort() { 292 sort.Sort(p) 293 } 294 295 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} 296 var anyRune = []rune{0, unicode.MaxRune} 297 298 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt, 299 // the match engine can always tell which branch to take. The routine may modify 300 // p if it is turned into a onepass Prog. If it isn't possible for this to be a 301 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive 302 // to the size of the Prog. 303 func makeOnePass(p *onePassProg) *onePassProg { 304 // If the machine is very long, it's not worth the time to check if we can use one pass. 305 if len(p.Inst) >= 1000 { 306 return notOnePass 307 } 308 309 var ( 310 instQueue = newQueue(len(p.Inst)) 311 visitQueue = newQueue(len(p.Inst)) 312 check func(uint32, map[uint32]bool) bool 313 onePassRunes = make([][]rune, len(p.Inst)) 314 ) 315 316 // check that paths from Alt instructions are unambiguous, and rebuild the new 317 // program as a onepass program 318 check = func(pc uint32, m map[uint32]bool) (ok bool) { 319 ok = true 320 inst := &p.Inst[pc] 321 if visitQueue.contains(pc) { 322 return 323 } 324 visitQueue.insert(pc) 325 switch inst.Op { 326 case syntax.InstAlt, syntax.InstAltMatch: 327 ok = check(inst.Out, m) && check(inst.Arg, m) 328 // check no-input paths to InstMatch 329 matchOut := m[inst.Out] 330 matchArg := m[inst.Arg] 331 if matchOut && matchArg { 332 ok = false 333 break 334 } 335 // Match on empty goes in inst.Out 336 if matchArg { 337 inst.Out, inst.Arg = inst.Arg, inst.Out 338 matchOut, matchArg = matchArg, matchOut 339 } 340 if matchOut { 341 m[pc] = true 342 inst.Op = syntax.InstAltMatch 343 } 344 345 // build a dispatch operator from the two legs of the alt. 346 onePassRunes[pc], inst.Next = mergeRuneSets( 347 &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) 348 if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { 349 ok = false 350 break 351 } 352 case syntax.InstCapture, syntax.InstNop: 353 ok = check(inst.Out, m) 354 m[pc] = m[inst.Out] 355 // pass matching runes back through these no-ops. 356 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 357 inst.Next = []uint32{} 358 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 359 inst.Next = append(inst.Next, inst.Out) 360 } 361 case syntax.InstEmptyWidth: 362 ok = check(inst.Out, m) 363 m[pc] = m[inst.Out] 364 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 365 inst.Next = []uint32{} 366 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 367 inst.Next = append(inst.Next, inst.Out) 368 } 369 case syntax.InstMatch, syntax.InstFail: 370 m[pc] = inst.Op == syntax.InstMatch 371 break 372 case syntax.InstRune: 373 m[pc] = false 374 if len(inst.Next) > 0 { 375 break 376 } 377 instQueue.insert(inst.Out) 378 if len(inst.Rune) == 0 { 379 onePassRunes[pc] = []rune{} 380 inst.Next = []uint32{inst.Out} 381 break 382 } 383 runes := make([]rune, 0) 384 if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 385 r0 := inst.Rune[0] 386 runes = append(runes, r0, r0) 387 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 388 runes = append(runes, r1, r1) 389 } 390 sort.Sort(runeSlice(runes)) 391 } else { 392 runes = append(runes, inst.Rune...) 393 } 394 onePassRunes[pc] = runes 395 inst.Next = []uint32{} 396 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 397 inst.Next = append(inst.Next, inst.Out) 398 } 399 inst.Op = syntax.InstRune 400 case syntax.InstRune1: 401 m[pc] = false 402 if len(inst.Next) > 0 { 403 break 404 } 405 instQueue.insert(inst.Out) 406 runes := []rune{} 407 // expand case-folded runes 408 if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 409 r0 := inst.Rune[0] 410 runes = append(runes, r0, r0) 411 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 412 runes = append(runes, r1, r1) 413 } 414 sort.Sort(runeSlice(runes)) 415 } else { 416 runes = append(runes, inst.Rune[0], inst.Rune[0]) 417 } 418 onePassRunes[pc] = runes 419 inst.Next = []uint32{} 420 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 421 inst.Next = append(inst.Next, inst.Out) 422 } 423 inst.Op = syntax.InstRune 424 case syntax.InstRuneAny: 425 m[pc] = false 426 if len(inst.Next) > 0 { 427 break 428 } 429 instQueue.insert(inst.Out) 430 onePassRunes[pc] = append([]rune{}, anyRune...) 431 inst.Next = []uint32{inst.Out} 432 case syntax.InstRuneAnyNotNL: 433 m[pc] = false 434 if len(inst.Next) > 0 { 435 break 436 } 437 instQueue.insert(inst.Out) 438 onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) 439 inst.Next = []uint32{} 440 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 441 inst.Next = append(inst.Next, inst.Out) 442 } 443 } 444 return 445 } 446 447 instQueue.clear() 448 instQueue.insert(uint32(p.Start)) 449 m := make(map[uint32]bool, len(p.Inst)) 450 for !instQueue.empty() { 451 visitQueue.clear() 452 pc := instQueue.next() 453 if !check(pc, m) { 454 p = notOnePass 455 break 456 } 457 } 458 if p != notOnePass { 459 for i := range p.Inst { 460 p.Inst[i].Rune = onePassRunes[i] 461 } 462 } 463 return p 464 } 465 466 var notOnePass *onePassProg = nil 467 468 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog 469 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the 470 // Prog cannot be converted. For a one pass prog, the fundamental condition that must 471 // be true is: at any InstAlt, there must be no ambiguity about what branch to take. 472 func compileOnePass(prog *syntax.Prog) (p *onePassProg) { 473 if prog.Start == 0 { 474 return notOnePass 475 } 476 // onepass regexp is anchored 477 if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || 478 syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { 479 return notOnePass 480 } 481 // every instruction leading to InstMatch must be EmptyEndText 482 for _, inst := range prog.Inst { 483 opOut := prog.Inst[inst.Out].Op 484 switch inst.Op { 485 default: 486 if opOut == syntax.InstMatch { 487 return notOnePass 488 } 489 case syntax.InstAlt, syntax.InstAltMatch: 490 if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { 491 return notOnePass 492 } 493 case syntax.InstEmptyWidth: 494 if opOut == syntax.InstMatch { 495 if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { 496 continue 497 } 498 return notOnePass 499 } 500 } 501 } 502 // Creates a slightly optimized copy of the original Prog 503 // that cleans up some Prog idioms that block valid onepass programs 504 p = onePassCopy(prog) 505 506 // checkAmbiguity on InstAlts, build onepass Prog if possible 507 p = makeOnePass(p) 508 509 if p != notOnePass { 510 cleanupOnePass(p, prog) 511 } 512 return p 513 }