github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/regexp/onepass.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package regexp 6 7 import ( 8 "bytes" 9 "regexp/syntax" 10 "sort" 11 "unicode" 12 ) 13 14 // "One-pass" regexp execution. 15 // Some regexps can be analyzed to determine that they never need 16 // backtracking: they are guaranteed to run in one pass over the string 17 // without bothering to save all the usual NFA state. 18 // Detect those and execute them more quickly. 19 20 // A onePassProg is a compiled one-pass regular expression program. 21 // It is the same as syntax.Prog except for the use of onePassInst. 22 type onePassProg struct { 23 Inst []onePassInst 24 Start int // index of start instruction 25 NumCap int // number of InstCapture insts in re 26 } 27 28 // A onePassInst is a single instruction in a one-pass regular expression program. 29 // It is the same as syntax.Inst except for the new 'Next' field. 30 type onePassInst struct { 31 syntax.Inst 32 Next []uint32 33 } 34 35 // OnePassPrefix returns a literal string that all matches for the 36 // regexp must start with. Complete is true if the prefix 37 // is the entire match. Pc is the index of the last rune instruction 38 // in the string. The OnePassPrefix skips over the mandatory 39 // EmptyBeginText 40 func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { 41 i := &p.Inst[p.Start] 42 if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { 43 return "", i.Op == syntax.InstMatch, uint32(p.Start) 44 } 45 pc = i.Out 46 i = &p.Inst[pc] 47 for i.Op == syntax.InstNop { 48 pc = i.Out 49 i = &p.Inst[pc] 50 } 51 // Avoid allocation of buffer if prefix is empty. 52 if iop(i) != syntax.InstRune || len(i.Rune) != 1 { 53 return "", i.Op == syntax.InstMatch, uint32(p.Start) 54 } 55 56 // Have prefix; gather characters. 57 var buf bytes.Buffer 58 for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 { 59 buf.WriteRune(i.Rune[0]) 60 pc, i = i.Out, &p.Inst[i.Out] 61 } 62 if i.Op == syntax.InstEmptyWidth && 63 syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 && 64 p.Inst[i.Out].Op == syntax.InstMatch { 65 complete = true 66 } 67 return buf.String(), complete, pc 68 } 69 70 // OnePassNext selects the next actionable state of the prog, based on the input character. 71 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine. 72 // One of the alternates may ultimately lead without input to end of line. If the instruction 73 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next. 74 func onePassNext(i *onePassInst, r rune) uint32 { 75 next := i.MatchRunePos(r) 76 if next >= 0 { 77 return i.Next[next] 78 } 79 if i.Op == syntax.InstAltMatch { 80 return i.Out 81 } 82 return 0 83 } 84 85 func iop(i *syntax.Inst) syntax.InstOp { 86 op := i.Op 87 switch op { 88 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 89 op = syntax.InstRune 90 } 91 return op 92 } 93 94 // Sparse Array implementation is used as a queueOnePass. 95 type queueOnePass struct { 96 sparse []uint32 97 dense []uint32 98 size, nextIndex uint32 99 } 100 101 func (q *queueOnePass) empty() bool { 102 return q.nextIndex >= q.size 103 } 104 105 func (q *queueOnePass) next() (n uint32) { 106 n = q.dense[q.nextIndex] 107 q.nextIndex++ 108 return 109 } 110 111 func (q *queueOnePass) clear() { 112 q.size = 0 113 q.nextIndex = 0 114 } 115 116 func (q *queueOnePass) contains(u uint32) bool { 117 if u >= uint32(len(q.sparse)) { 118 return false 119 } 120 return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u 121 } 122 123 func (q *queueOnePass) insert(u uint32) { 124 if !q.contains(u) { 125 q.insertNew(u) 126 } 127 } 128 129 func (q *queueOnePass) insertNew(u uint32) { 130 if u >= uint32(len(q.sparse)) { 131 return 132 } 133 q.sparse[u] = q.size 134 q.dense[q.size] = u 135 q.size++ 136 } 137 138 func newQueue(size int) (q *queueOnePass) { 139 return &queueOnePass{ 140 sparse: make([]uint32, size), 141 dense: make([]uint32, size), 142 } 143 } 144 145 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result, 146 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index 147 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a 148 // NextIp array with the single element mergeFailed is returned. 149 // The code assumes that both inputs contain ordered and non-intersecting rune pairs. 150 const mergeFailed = uint32(0xffffffff) 151 152 var ( 153 noRune = []rune{} 154 noNext = []uint32{mergeFailed} 155 ) 156 157 func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) { 158 leftLen := len(*leftRunes) 159 rightLen := len(*rightRunes) 160 if leftLen&0x1 != 0 || rightLen&0x1 != 0 { 161 panic("mergeRuneSets odd length []rune") 162 } 163 var ( 164 lx, rx int 165 ) 166 merged := make([]rune, 0) 167 next := make([]uint32, 0) 168 ok := true 169 defer func() { 170 if !ok { 171 merged = nil 172 next = nil 173 } 174 }() 175 176 ix := -1 177 extend := func(newLow *int, newArray *[]rune, pc uint32) bool { 178 if ix > 0 && (*newArray)[*newLow] <= merged[ix] { 179 return false 180 } 181 merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1]) 182 *newLow += 2 183 ix += 2 184 next = append(next, pc) 185 return true 186 } 187 188 for lx < leftLen || rx < rightLen { 189 switch { 190 case rx >= rightLen: 191 ok = extend(&lx, leftRunes, leftPC) 192 case lx >= leftLen: 193 ok = extend(&rx, rightRunes, rightPC) 194 case (*rightRunes)[rx] < (*leftRunes)[lx]: 195 ok = extend(&rx, rightRunes, rightPC) 196 default: 197 ok = extend(&lx, leftRunes, leftPC) 198 } 199 if !ok { 200 return noRune, noNext 201 } 202 } 203 return merged, next 204 } 205 206 // cleanupOnePass drops working memory, and restores certain shortcut instructions. 207 func cleanupOnePass(prog *onePassProg, original *syntax.Prog) { 208 for ix, instOriginal := range original.Inst { 209 switch instOriginal.Op { 210 case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune: 211 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail: 212 prog.Inst[ix].Next = nil 213 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 214 prog.Inst[ix].Next = nil 215 prog.Inst[ix] = onePassInst{Inst: instOriginal} 216 } 217 } 218 } 219 220 // onePassCopy creates a copy of the original Prog, as we'll be modifying it 221 func onePassCopy(prog *syntax.Prog) *onePassProg { 222 p := &onePassProg{ 223 Start: prog.Start, 224 NumCap: prog.NumCap, 225 Inst: make([]onePassInst, len(prog.Inst)), 226 } 227 for i, inst := range prog.Inst { 228 p.Inst[i] = onePassInst{Inst: inst} 229 } 230 231 // rewrites one or more common Prog constructs that enable some otherwise 232 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at 233 // ip A, that points to ips B & C. 234 // A:BC + B:DA => A:BC + B:CD 235 // A:BC + B:DC => A:DC + B:DC 236 for pc := range p.Inst { 237 switch p.Inst[pc].Op { 238 default: 239 continue 240 case syntax.InstAlt, syntax.InstAltMatch: 241 // A:Bx + B:Ay 242 p_A_Other := &p.Inst[pc].Out 243 p_A_Alt := &p.Inst[pc].Arg 244 // make sure a target is another Alt 245 instAlt := p.Inst[*p_A_Alt] 246 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 247 p_A_Alt, p_A_Other = p_A_Other, p_A_Alt 248 instAlt = p.Inst[*p_A_Alt] 249 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 250 continue 251 } 252 } 253 instOther := p.Inst[*p_A_Other] 254 // Analyzing both legs pointing to Alts is for another day 255 if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch { 256 // too complicated 257 continue 258 } 259 // simple empty transition loop 260 // A:BC + B:DA => A:BC + B:DC 261 p_B_Alt := &p.Inst[*p_A_Alt].Out 262 p_B_Other := &p.Inst[*p_A_Alt].Arg 263 patch := false 264 if instAlt.Out == uint32(pc) { 265 patch = true 266 } else if instAlt.Arg == uint32(pc) { 267 patch = true 268 p_B_Alt, p_B_Other = p_B_Other, p_B_Alt 269 } 270 if patch { 271 *p_B_Alt = *p_A_Other 272 } 273 274 // empty transition to common target 275 // A:BC + B:DC => A:DC + B:DC 276 if *p_A_Other == *p_B_Alt { 277 *p_A_Alt = *p_B_Other 278 } 279 } 280 } 281 return p 282 } 283 284 // runeSlice exists to permit sorting the case-folded rune sets. 285 type runeSlice []rune 286 287 func (p runeSlice) Len() int { return len(p) } 288 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } 289 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 290 291 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} 292 var anyRune = []rune{0, unicode.MaxRune} 293 294 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt, 295 // the match engine can always tell which branch to take. The routine may modify 296 // p if it is turned into a onepass Prog. If it isn't possible for this to be a 297 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive 298 // to the size of the Prog. 299 func makeOnePass(p *onePassProg) *onePassProg { 300 // If the machine is very long, it's not worth the time to check if we can use one pass. 301 if len(p.Inst) >= 1000 { 302 return notOnePass 303 } 304 305 var ( 306 instQueue = newQueue(len(p.Inst)) 307 visitQueue = newQueue(len(p.Inst)) 308 check func(uint32, []bool) bool 309 onePassRunes = make([][]rune, len(p.Inst)) 310 ) 311 312 // check that paths from Alt instructions are unambiguous, and rebuild the new 313 // program as a onepass program 314 check = func(pc uint32, m []bool) (ok bool) { 315 ok = true 316 inst := &p.Inst[pc] 317 if visitQueue.contains(pc) { 318 return 319 } 320 visitQueue.insert(pc) 321 switch inst.Op { 322 case syntax.InstAlt, syntax.InstAltMatch: 323 ok = check(inst.Out, m) && check(inst.Arg, m) 324 // check no-input paths to InstMatch 325 matchOut := m[inst.Out] 326 matchArg := m[inst.Arg] 327 if matchOut && matchArg { 328 ok = false 329 break 330 } 331 // Match on empty goes in inst.Out 332 if matchArg { 333 inst.Out, inst.Arg = inst.Arg, inst.Out 334 matchOut, matchArg = matchArg, matchOut 335 } 336 if matchOut { 337 m[pc] = true 338 inst.Op = syntax.InstAltMatch 339 } 340 341 // build a dispatch operator from the two legs of the alt. 342 onePassRunes[pc], inst.Next = mergeRuneSets( 343 &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) 344 if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { 345 ok = false 346 break 347 } 348 case syntax.InstCapture, syntax.InstNop: 349 ok = check(inst.Out, m) 350 m[pc] = m[inst.Out] 351 // pass matching runes back through these no-ops. 352 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 353 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 354 for i := range inst.Next { 355 inst.Next[i] = inst.Out 356 } 357 case syntax.InstEmptyWidth: 358 ok = check(inst.Out, m) 359 m[pc] = m[inst.Out] 360 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 361 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 362 for i := range inst.Next { 363 inst.Next[i] = inst.Out 364 } 365 case syntax.InstMatch, syntax.InstFail: 366 m[pc] = inst.Op == syntax.InstMatch 367 case syntax.InstRune: 368 m[pc] = false 369 if len(inst.Next) > 0 { 370 break 371 } 372 instQueue.insert(inst.Out) 373 if len(inst.Rune) == 0 { 374 onePassRunes[pc] = []rune{} 375 inst.Next = []uint32{inst.Out} 376 break 377 } 378 runes := make([]rune, 0) 379 if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 380 r0 := inst.Rune[0] 381 runes = append(runes, r0, r0) 382 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 383 runes = append(runes, r1, r1) 384 } 385 sort.Sort(runeSlice(runes)) 386 } else { 387 runes = append(runes, inst.Rune...) 388 } 389 onePassRunes[pc] = runes 390 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 391 for i := range inst.Next { 392 inst.Next[i] = inst.Out 393 } 394 inst.Op = syntax.InstRune 395 case syntax.InstRune1: 396 m[pc] = false 397 if len(inst.Next) > 0 { 398 break 399 } 400 instQueue.insert(inst.Out) 401 runes := []rune{} 402 // expand case-folded runes 403 if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 404 r0 := inst.Rune[0] 405 runes = append(runes, r0, r0) 406 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 407 runes = append(runes, r1, r1) 408 } 409 sort.Sort(runeSlice(runes)) 410 } else { 411 runes = append(runes, inst.Rune[0], inst.Rune[0]) 412 } 413 onePassRunes[pc] = runes 414 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 415 for i := range inst.Next { 416 inst.Next[i] = inst.Out 417 } 418 inst.Op = syntax.InstRune 419 case syntax.InstRuneAny: 420 m[pc] = false 421 if len(inst.Next) > 0 { 422 break 423 } 424 instQueue.insert(inst.Out) 425 onePassRunes[pc] = append([]rune{}, anyRune...) 426 inst.Next = []uint32{inst.Out} 427 case syntax.InstRuneAnyNotNL: 428 m[pc] = false 429 if len(inst.Next) > 0 { 430 break 431 } 432 instQueue.insert(inst.Out) 433 onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) 434 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 435 for i := range inst.Next { 436 inst.Next[i] = inst.Out 437 } 438 } 439 return 440 } 441 442 instQueue.clear() 443 instQueue.insert(uint32(p.Start)) 444 m := make([]bool, len(p.Inst)) 445 for !instQueue.empty() { 446 visitQueue.clear() 447 pc := instQueue.next() 448 if !check(pc, m) { 449 p = notOnePass 450 break 451 } 452 } 453 if p != notOnePass { 454 for i := range p.Inst { 455 p.Inst[i].Rune = onePassRunes[i] 456 } 457 } 458 return p 459 } 460 461 var notOnePass *onePassProg = nil 462 463 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog 464 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the 465 // Prog cannot be converted. For a one pass prog, the fundamental condition that must 466 // be true is: at any InstAlt, there must be no ambiguity about what branch to take. 467 func compileOnePass(prog *syntax.Prog) (p *onePassProg) { 468 if prog.Start == 0 { 469 return notOnePass 470 } 471 // onepass regexp is anchored 472 if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || 473 syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { 474 return notOnePass 475 } 476 // every instruction leading to InstMatch must be EmptyEndText 477 for _, inst := range prog.Inst { 478 opOut := prog.Inst[inst.Out].Op 479 switch inst.Op { 480 default: 481 if opOut == syntax.InstMatch { 482 return notOnePass 483 } 484 case syntax.InstAlt, syntax.InstAltMatch: 485 if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { 486 return notOnePass 487 } 488 case syntax.InstEmptyWidth: 489 if opOut == syntax.InstMatch { 490 if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { 491 continue 492 } 493 return notOnePass 494 } 495 } 496 } 497 // Creates a slightly optimized copy of the original Prog 498 // that cleans up some Prog idioms that block valid onepass programs 499 p = onePassCopy(prog) 500 501 // checkAmbiguity on InstAlts, build onepass Prog if possible 502 p = makeOnePass(p) 503 504 if p != notOnePass { 505 cleanupOnePass(p, prog) 506 } 507 return p 508 }