github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/regexp/exec.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package regexp 6 7 import ( 8 "io" 9 "regexp/syntax" 10 ) 11 12 // A queue is a 'sparse array' holding pending threads of execution. 13 // See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html 14 type queue struct { 15 sparse []uint32 16 dense []entry 17 } 18 19 // A entry is an entry on a queue. 20 // It holds both the instruction pc and the actual thread. 21 // Some queue entries are just place holders so that the machine 22 // knows it has considered that pc. Such entries have t == nil. 23 type entry struct { 24 pc uint32 25 t *thread 26 } 27 28 // A thread is the state of a single path through the machine: 29 // an instruction and a corresponding capture array. 30 // See http://swtch.com/~rsc/regexp/regexp2.html 31 type thread struct { 32 inst *syntax.Inst 33 cap []int 34 } 35 36 // A machine holds all the state during an NFA simulation for p. 37 type machine struct { 38 re *Regexp // corresponding Regexp 39 p *syntax.Prog // compiled program 40 op *onePassProg // compiled onepass program, or notOnePass 41 maxBitStateLen int // max length of string to search with bitstate 42 b *bitState // state for backtracker, allocated lazily 43 q0, q1 queue // two queues for runq, nextq 44 pool []*thread // pool of available threads 45 matched bool // whether a match was found 46 matchcap []int // capture information for the match 47 48 // cached inputs, to avoid allocation 49 inputBytes inputBytes 50 inputString inputString 51 inputReader inputReader 52 } 53 54 func (m *machine) newInputBytes(b []byte) input { 55 m.inputBytes.str = b 56 return &m.inputBytes 57 } 58 59 func (m *machine) newInputString(s string) input { 60 m.inputString.str = s 61 return &m.inputString 62 } 63 64 func (m *machine) newInputReader(r io.RuneReader) input { 65 m.inputReader.r = r 66 m.inputReader.atEOT = false 67 m.inputReader.pos = 0 68 return &m.inputReader 69 } 70 71 // progMachine returns a new machine running the prog p. 72 func progMachine(p *syntax.Prog, op *onePassProg) *machine { 73 m := &machine{p: p, op: op} 74 n := len(m.p.Inst) 75 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 76 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 77 ncap := p.NumCap 78 if ncap < 2 { 79 ncap = 2 80 } 81 if op == notOnePass { 82 m.maxBitStateLen = maxBitStateLen(p) 83 } 84 m.matchcap = make([]int, ncap) 85 return m 86 } 87 88 func (m *machine) init(ncap int) { 89 for _, t := range m.pool { 90 t.cap = t.cap[:ncap] 91 } 92 m.matchcap = m.matchcap[:ncap] 93 } 94 95 // alloc allocates a new thread with the given instruction. 96 // It uses the free pool if possible. 97 func (m *machine) alloc(i *syntax.Inst) *thread { 98 var t *thread 99 if n := len(m.pool); n > 0 { 100 t = m.pool[n-1] 101 m.pool = m.pool[:n-1] 102 } else { 103 t = new(thread) 104 t.cap = make([]int, len(m.matchcap), cap(m.matchcap)) 105 } 106 t.inst = i 107 return t 108 } 109 110 // match runs the machine over the input starting at pos. 111 // It reports whether a match was found. 112 // If so, m.matchcap holds the submatch information. 113 func (m *machine) match(i input, pos int) bool { 114 startCond := m.re.cond 115 if startCond == ^syntax.EmptyOp(0) { // impossible 116 return false 117 } 118 m.matched = false 119 for i := range m.matchcap { 120 m.matchcap[i] = -1 121 } 122 runq, nextq := &m.q0, &m.q1 123 r, r1 := endOfText, endOfText 124 width, width1 := 0, 0 125 r, width = i.step(pos) 126 if r != endOfText { 127 r1, width1 = i.step(pos + width) 128 } 129 var flag syntax.EmptyOp 130 if pos == 0 { 131 flag = syntax.EmptyOpContext(-1, r) 132 } else { 133 flag = i.context(pos) 134 } 135 for { 136 if len(runq.dense) == 0 { 137 if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 138 // Anchored match, past beginning of text. 139 break 140 } 141 if m.matched { 142 // Have match; finished exploring alternatives. 143 break 144 } 145 if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() { 146 // Match requires literal prefix; fast search for it. 147 advance := i.index(m.re, pos) 148 if advance < 0 { 149 break 150 } 151 pos += advance 152 r, width = i.step(pos) 153 r1, width1 = i.step(pos + width) 154 } 155 } 156 if !m.matched { 157 if len(m.matchcap) > 0 { 158 m.matchcap[0] = pos 159 } 160 m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil) 161 } 162 flag = syntax.EmptyOpContext(r, r1) 163 m.step(runq, nextq, pos, pos+width, r, flag) 164 if width == 0 { 165 break 166 } 167 if len(m.matchcap) == 0 && m.matched { 168 // Found a match and not paying attention 169 // to where it is, so any match will do. 170 break 171 } 172 pos += width 173 r, width = r1, width1 174 if r != endOfText { 175 r1, width1 = i.step(pos + width) 176 } 177 runq, nextq = nextq, runq 178 } 179 m.clear(nextq) 180 return m.matched 181 } 182 183 // clear frees all threads on the thread queue. 184 func (m *machine) clear(q *queue) { 185 for _, d := range q.dense { 186 if d.t != nil { 187 m.pool = append(m.pool, d.t) 188 } 189 } 190 q.dense = q.dense[:0] 191 } 192 193 // step executes one step of the machine, running each of the threads 194 // on runq and appending new threads to nextq. 195 // The step processes the rune c (which may be endOfText), 196 // which starts at position pos and ends at nextPos. 197 // nextCond gives the setting for the empty-width flags after c. 198 func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) { 199 longest := m.re.longest 200 for j := 0; j < len(runq.dense); j++ { 201 d := &runq.dense[j] 202 t := d.t 203 if t == nil { 204 continue 205 } 206 if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { 207 m.pool = append(m.pool, t) 208 continue 209 } 210 i := t.inst 211 add := false 212 switch i.Op { 213 default: 214 panic("bad inst") 215 216 case syntax.InstMatch: 217 if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { 218 t.cap[1] = pos 219 copy(m.matchcap, t.cap) 220 } 221 if !longest { 222 // First-match mode: cut off all lower-priority threads. 223 for _, d := range runq.dense[j+1:] { 224 if d.t != nil { 225 m.pool = append(m.pool, d.t) 226 } 227 } 228 runq.dense = runq.dense[:0] 229 } 230 m.matched = true 231 232 case syntax.InstRune: 233 add = i.MatchRune(c) 234 case syntax.InstRune1: 235 add = c == i.Rune[0] 236 case syntax.InstRuneAny: 237 add = true 238 case syntax.InstRuneAnyNotNL: 239 add = c != '\n' 240 } 241 if add { 242 t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) 243 } 244 if t != nil { 245 m.pool = append(m.pool, t) 246 } 247 } 248 runq.dense = runq.dense[:0] 249 } 250 251 // add adds an entry to q for pc, unless the q already has such an entry. 252 // It also recursively adds an entry for all instructions reachable from pc by following 253 // empty-width conditions satisfied by cond. pos gives the current position 254 // in the input. 255 func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread { 256 if pc == 0 { 257 return t 258 } 259 if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { 260 return t 261 } 262 263 j := len(q.dense) 264 q.dense = q.dense[:j+1] 265 d := &q.dense[j] 266 d.t = nil 267 d.pc = pc 268 q.sparse[pc] = uint32(j) 269 270 i := &m.p.Inst[pc] 271 switch i.Op { 272 default: 273 panic("unhandled") 274 case syntax.InstFail: 275 // nothing 276 case syntax.InstAlt, syntax.InstAltMatch: 277 t = m.add(q, i.Out, pos, cap, cond, t) 278 t = m.add(q, i.Arg, pos, cap, cond, t) 279 case syntax.InstEmptyWidth: 280 if syntax.EmptyOp(i.Arg)&^cond == 0 { 281 t = m.add(q, i.Out, pos, cap, cond, t) 282 } 283 case syntax.InstNop: 284 t = m.add(q, i.Out, pos, cap, cond, t) 285 case syntax.InstCapture: 286 if int(i.Arg) < len(cap) { 287 opos := cap[i.Arg] 288 cap[i.Arg] = pos 289 m.add(q, i.Out, pos, cap, cond, nil) 290 cap[i.Arg] = opos 291 } else { 292 t = m.add(q, i.Out, pos, cap, cond, t) 293 } 294 case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 295 if t == nil { 296 t = m.alloc(i) 297 } else { 298 t.inst = i 299 } 300 if len(cap) > 0 && &t.cap[0] != &cap[0] { 301 copy(t.cap, cap) 302 } 303 d.t = t 304 t = nil 305 } 306 return t 307 } 308 309 // onepass runs the machine over the input starting at pos. 310 // It reports whether a match was found. 311 // If so, m.matchcap holds the submatch information. 312 // ncap is the number of captures. 313 func (m *machine) onepass(i input, pos, ncap int) bool { 314 startCond := m.re.cond 315 if startCond == ^syntax.EmptyOp(0) { // impossible 316 return false 317 } 318 m.matched = false 319 m.matchcap = m.matchcap[:ncap] 320 for i := range m.matchcap { 321 m.matchcap[i] = -1 322 } 323 r, r1 := endOfText, endOfText 324 width, width1 := 0, 0 325 r, width = i.step(pos) 326 if r != endOfText { 327 r1, width1 = i.step(pos + width) 328 } 329 var flag syntax.EmptyOp 330 if pos == 0 { 331 flag = syntax.EmptyOpContext(-1, r) 332 } else { 333 flag = i.context(pos) 334 } 335 pc := m.op.Start 336 inst := m.op.Inst[pc] 337 // If there is a simple literal prefix, skip over it. 338 if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 && 339 len(m.re.prefix) > 0 && i.canCheckPrefix() { 340 // Match requires literal prefix; fast search for it. 341 if i.hasPrefix(m.re) { 342 pos += len(m.re.prefix) 343 r, width = i.step(pos) 344 r1, width1 = i.step(pos + width) 345 flag = i.context(pos) 346 pc = int(m.re.prefixEnd) 347 } else { 348 return m.matched 349 } 350 } 351 for { 352 inst = m.op.Inst[pc] 353 pc = int(inst.Out) 354 switch inst.Op { 355 default: 356 panic("bad inst") 357 case syntax.InstMatch: 358 m.matched = true 359 if len(m.matchcap) > 0 { 360 m.matchcap[0] = 0 361 m.matchcap[1] = pos 362 } 363 return m.matched 364 case syntax.InstRune: 365 if !inst.MatchRune(r) { 366 return m.matched 367 } 368 case syntax.InstRune1: 369 if r != inst.Rune[0] { 370 return m.matched 371 } 372 case syntax.InstRuneAny: 373 // Nothing 374 case syntax.InstRuneAnyNotNL: 375 if r == '\n' { 376 return m.matched 377 } 378 // peek at the input rune to see which branch of the Alt to take 379 case syntax.InstAlt, syntax.InstAltMatch: 380 pc = int(onePassNext(&inst, r)) 381 continue 382 case syntax.InstFail: 383 return m.matched 384 case syntax.InstNop: 385 continue 386 case syntax.InstEmptyWidth: 387 if syntax.EmptyOp(inst.Arg)&^flag != 0 { 388 return m.matched 389 } 390 continue 391 case syntax.InstCapture: 392 if int(inst.Arg) < len(m.matchcap) { 393 m.matchcap[inst.Arg] = pos 394 } 395 continue 396 } 397 if width == 0 { 398 break 399 } 400 flag = syntax.EmptyOpContext(r, r1) 401 pos += width 402 r, width = r1, width1 403 if r != endOfText { 404 r1, width1 = i.step(pos + width) 405 } 406 } 407 return m.matched 408 } 409 410 // doMatch reports whether either r, b or s match the regexp. 411 func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool { 412 return re.doExecute(r, b, s, 0, 0, nil) != nil 413 } 414 415 // doExecute finds the leftmost match in the input, appends the position 416 // of its subexpressions to dstCap and returns dstCap. 417 // 418 // nil is returned if no matches are found and non-nil if matches are found. 419 func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int { 420 m := re.get() 421 var i input 422 var size int 423 if r != nil { 424 i = m.newInputReader(r) 425 } else if b != nil { 426 i = m.newInputBytes(b) 427 size = len(b) 428 } else { 429 i = m.newInputString(s) 430 size = len(s) 431 } 432 if m.op != notOnePass { 433 if !m.onepass(i, pos, ncap) { 434 re.put(m) 435 return nil 436 } 437 } else if size < m.maxBitStateLen && r == nil { 438 if m.b == nil { 439 m.b = newBitState(m.p) 440 } 441 if !m.backtrack(i, pos, size, ncap) { 442 re.put(m) 443 return nil 444 } 445 } else { 446 m.init(ncap) 447 if !m.match(i, pos) { 448 re.put(m) 449 return nil 450 } 451 } 452 dstCap = append(dstCap, m.matchcap...) 453 if dstCap == nil { 454 // Keep the promise of returning non-nil value on match. 455 dstCap = arrayNoInts[:0] 456 } 457 re.put(m) 458 return dstCap 459 } 460 461 // arrayNoInts is returned by doExecute match if nil dstCap is passed 462 // to it with ncap=0. 463 var arrayNoInts [0]int