github.com/bgentry/go@v0.0.0-20150121062915-6cf5a733d54d/src/regexp/exec.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package regexp 6 7 import ( 8 "io" 9 "regexp/syntax" 10 ) 11 12 // A queue is a 'sparse array' holding pending threads of execution. 13 // See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html 14 type queue struct { 15 sparse []uint32 16 dense []entry 17 } 18 19 // A entry is an entry on a queue. 20 // It holds both the instruction pc and the actual thread. 21 // Some queue entries are just place holders so that the machine 22 // knows it has considered that pc. Such entries have t == nil. 23 type entry struct { 24 pc uint32 25 t *thread 26 } 27 28 // A thread is the state of a single path through the machine: 29 // an instruction and a corresponding capture array. 30 // See http://swtch.com/~rsc/regexp/regexp2.html 31 type thread struct { 32 inst *syntax.Inst 33 cap []int 34 } 35 36 // A machine holds all the state during an NFA simulation for p. 37 type machine struct { 38 re *Regexp // corresponding Regexp 39 p *syntax.Prog // compiled program 40 op *onePassProg // compiled onepass program, or notOnePass 41 q0, q1 queue // two queues for runq, nextq 42 pool []*thread // pool of available threads 43 matched bool // whether a match was found 44 matchcap []int // capture information for the match 45 46 // cached inputs, to avoid allocation 47 inputBytes inputBytes 48 inputString inputString 49 inputReader inputReader 50 } 51 52 func (m *machine) newInputBytes(b []byte) input { 53 m.inputBytes.str = b 54 return &m.inputBytes 55 } 56 57 func (m *machine) newInputString(s string) input { 58 m.inputString.str = s 59 return &m.inputString 60 } 61 62 func (m *machine) newInputReader(r io.RuneReader) input { 63 m.inputReader.r = r 64 m.inputReader.atEOT = false 65 m.inputReader.pos = 0 66 return &m.inputReader 67 } 68 69 // progMachine returns a new machine running the prog p. 70 func progMachine(p *syntax.Prog, op *onePassProg) *machine { 71 m := &machine{p: p, op: op} 72 n := len(m.p.Inst) 73 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 74 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 75 ncap := p.NumCap 76 if ncap < 2 { 77 ncap = 2 78 } 79 m.matchcap = make([]int, ncap) 80 return m 81 } 82 83 func (m *machine) init(ncap int) { 84 for _, t := range m.pool { 85 t.cap = t.cap[:ncap] 86 } 87 m.matchcap = m.matchcap[:ncap] 88 } 89 90 // alloc allocates a new thread with the given instruction. 91 // It uses the free pool if possible. 92 func (m *machine) alloc(i *syntax.Inst) *thread { 93 var t *thread 94 if n := len(m.pool); n > 0 { 95 t = m.pool[n-1] 96 m.pool = m.pool[:n-1] 97 } else { 98 t = new(thread) 99 t.cap = make([]int, len(m.matchcap), cap(m.matchcap)) 100 } 101 t.inst = i 102 return t 103 } 104 105 // free returns t to the free pool. 106 func (m *machine) free(t *thread) { 107 m.inputBytes.str = nil 108 m.inputString.str = "" 109 m.inputReader.r = nil 110 m.pool = append(m.pool, t) 111 } 112 113 // match runs the machine over the input starting at pos. 114 // It reports whether a match was found. 115 // If so, m.matchcap holds the submatch information. 116 func (m *machine) match(i input, pos int) bool { 117 startCond := m.re.cond 118 if startCond == ^syntax.EmptyOp(0) { // impossible 119 return false 120 } 121 m.matched = false 122 for i := range m.matchcap { 123 m.matchcap[i] = -1 124 } 125 runq, nextq := &m.q0, &m.q1 126 r, r1 := endOfText, endOfText 127 width, width1 := 0, 0 128 r, width = i.step(pos) 129 if r != endOfText { 130 r1, width1 = i.step(pos + width) 131 } 132 var flag syntax.EmptyOp 133 if pos == 0 { 134 flag = syntax.EmptyOpContext(-1, r) 135 } else { 136 flag = i.context(pos) 137 } 138 for { 139 if len(runq.dense) == 0 { 140 if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 141 // Anchored match, past beginning of text. 142 break 143 } 144 if m.matched { 145 // Have match; finished exploring alternatives. 146 break 147 } 148 if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() { 149 // Match requires literal prefix; fast search for it. 150 advance := i.index(m.re, pos) 151 if advance < 0 { 152 break 153 } 154 pos += advance 155 r, width = i.step(pos) 156 r1, width1 = i.step(pos + width) 157 } 158 } 159 if !m.matched { 160 if len(m.matchcap) > 0 { 161 m.matchcap[0] = pos 162 } 163 m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil) 164 } 165 flag = syntax.EmptyOpContext(r, r1) 166 m.step(runq, nextq, pos, pos+width, r, flag) 167 if width == 0 { 168 break 169 } 170 if len(m.matchcap) == 0 && m.matched { 171 // Found a match and not paying attention 172 // to where it is, so any match will do. 173 break 174 } 175 pos += width 176 r, width = r1, width1 177 if r != endOfText { 178 r1, width1 = i.step(pos + width) 179 } 180 runq, nextq = nextq, runq 181 } 182 m.clear(nextq) 183 return m.matched 184 } 185 186 // clear frees all threads on the thread queue. 187 func (m *machine) clear(q *queue) { 188 for _, d := range q.dense { 189 if d.t != nil { 190 // m.free(d.t) 191 m.pool = append(m.pool, d.t) 192 } 193 } 194 q.dense = q.dense[:0] 195 } 196 197 // step executes one step of the machine, running each of the threads 198 // on runq and appending new threads to nextq. 199 // The step processes the rune c (which may be endOfText), 200 // which starts at position pos and ends at nextPos. 201 // nextCond gives the setting for the empty-width flags after c. 202 func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) { 203 longest := m.re.longest 204 for j := 0; j < len(runq.dense); j++ { 205 d := &runq.dense[j] 206 t := d.t 207 if t == nil { 208 continue 209 } 210 if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { 211 // m.free(t) 212 m.pool = append(m.pool, t) 213 continue 214 } 215 i := t.inst 216 add := false 217 switch i.Op { 218 default: 219 panic("bad inst") 220 221 case syntax.InstMatch: 222 if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { 223 t.cap[1] = pos 224 copy(m.matchcap, t.cap) 225 } 226 if !longest { 227 // First-match mode: cut off all lower-priority threads. 228 for _, d := range runq.dense[j+1:] { 229 if d.t != nil { 230 // m.free(d.t) 231 m.pool = append(m.pool, d.t) 232 } 233 } 234 runq.dense = runq.dense[:0] 235 } 236 m.matched = true 237 238 case syntax.InstRune: 239 add = i.MatchRune(c) 240 case syntax.InstRune1: 241 add = c == i.Rune[0] 242 case syntax.InstRuneAny: 243 add = true 244 case syntax.InstRuneAnyNotNL: 245 add = c != '\n' 246 } 247 if add { 248 t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) 249 } 250 if t != nil { 251 // m.free(t) 252 m.pool = append(m.pool, t) 253 } 254 } 255 runq.dense = runq.dense[:0] 256 } 257 258 // add adds an entry to q for pc, unless the q already has such an entry. 259 // It also recursively adds an entry for all instructions reachable from pc by following 260 // empty-width conditions satisfied by cond. pos gives the current position 261 // in the input. 262 func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread { 263 if pc == 0 { 264 return t 265 } 266 if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { 267 return t 268 } 269 270 j := len(q.dense) 271 q.dense = q.dense[:j+1] 272 d := &q.dense[j] 273 d.t = nil 274 d.pc = pc 275 q.sparse[pc] = uint32(j) 276 277 i := &m.p.Inst[pc] 278 switch i.Op { 279 default: 280 panic("unhandled") 281 case syntax.InstFail: 282 // nothing 283 case syntax.InstAlt, syntax.InstAltMatch: 284 t = m.add(q, i.Out, pos, cap, cond, t) 285 t = m.add(q, i.Arg, pos, cap, cond, t) 286 case syntax.InstEmptyWidth: 287 if syntax.EmptyOp(i.Arg)&^cond == 0 { 288 t = m.add(q, i.Out, pos, cap, cond, t) 289 } 290 case syntax.InstNop: 291 t = m.add(q, i.Out, pos, cap, cond, t) 292 case syntax.InstCapture: 293 if int(i.Arg) < len(cap) { 294 opos := cap[i.Arg] 295 cap[i.Arg] = pos 296 m.add(q, i.Out, pos, cap, cond, nil) 297 cap[i.Arg] = opos 298 } else { 299 t = m.add(q, i.Out, pos, cap, cond, t) 300 } 301 case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 302 if t == nil { 303 t = m.alloc(i) 304 } else { 305 t.inst = i 306 } 307 if len(cap) > 0 && &t.cap[0] != &cap[0] { 308 copy(t.cap, cap) 309 } 310 d.t = t 311 t = nil 312 } 313 return t 314 } 315 316 // onepass runs the machine over the input starting at pos. 317 // It reports whether a match was found. 318 // If so, m.matchcap holds the submatch information. 319 func (m *machine) onepass(i input, pos int) bool { 320 startCond := m.re.cond 321 if startCond == ^syntax.EmptyOp(0) { // impossible 322 return false 323 } 324 m.matched = false 325 for i := range m.matchcap { 326 m.matchcap[i] = -1 327 } 328 r, r1 := endOfText, endOfText 329 width, width1 := 0, 0 330 r, width = i.step(pos) 331 if r != endOfText { 332 r1, width1 = i.step(pos + width) 333 } 334 var flag syntax.EmptyOp 335 if pos == 0 { 336 flag = syntax.EmptyOpContext(-1, r) 337 } else { 338 flag = i.context(pos) 339 } 340 pc := m.op.Start 341 inst := m.op.Inst[pc] 342 // If there is a simple literal prefix, skip over it. 343 if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 && 344 len(m.re.prefix) > 0 && i.canCheckPrefix() { 345 // Match requires literal prefix; fast search for it. 346 if i.hasPrefix(m.re) { 347 pos += len(m.re.prefix) 348 r, width = i.step(pos) 349 r1, width1 = i.step(pos + width) 350 flag = i.context(pos) 351 pc = int(m.re.prefixEnd) 352 } else { 353 return m.matched 354 } 355 } 356 for { 357 inst = m.op.Inst[pc] 358 pc = int(inst.Out) 359 switch inst.Op { 360 default: 361 panic("bad inst") 362 case syntax.InstMatch: 363 m.matched = true 364 if len(m.matchcap) > 0 { 365 m.matchcap[0] = 0 366 m.matchcap[1] = pos 367 } 368 return m.matched 369 case syntax.InstRune: 370 if !inst.MatchRune(r) { 371 return m.matched 372 } 373 case syntax.InstRune1: 374 if r != inst.Rune[0] { 375 return m.matched 376 } 377 case syntax.InstRuneAny: 378 // Nothing 379 case syntax.InstRuneAnyNotNL: 380 if r == '\n' { 381 return m.matched 382 } 383 // peek at the input rune to see which branch of the Alt to take 384 case syntax.InstAlt, syntax.InstAltMatch: 385 pc = int(onePassNext(&inst, r)) 386 continue 387 case syntax.InstFail: 388 return m.matched 389 case syntax.InstNop: 390 continue 391 case syntax.InstEmptyWidth: 392 if syntax.EmptyOp(inst.Arg)&^flag != 0 { 393 return m.matched 394 } 395 continue 396 case syntax.InstCapture: 397 if int(inst.Arg) < len(m.matchcap) { 398 m.matchcap[inst.Arg] = pos 399 } 400 continue 401 } 402 if width == 0 { 403 break 404 } 405 flag = syntax.EmptyOpContext(r, r1) 406 pos += width 407 r, width = r1, width1 408 if r != endOfText { 409 r1, width1 = i.step(pos + width) 410 } 411 } 412 return m.matched 413 } 414 415 // empty is a non-nil 0-element slice, 416 // so doExecute can avoid an allocation 417 // when 0 captures are requested from a successful match. 418 var empty = make([]int, 0) 419 420 // doExecute finds the leftmost match in the input and returns 421 // the position of its subexpressions. 422 func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int) []int { 423 m := re.get() 424 var i input 425 if r != nil { 426 i = m.newInputReader(r) 427 } else if b != nil { 428 i = m.newInputBytes(b) 429 } else { 430 i = m.newInputString(s) 431 } 432 if m.op != notOnePass { 433 if !m.onepass(i, pos) { 434 re.put(m) 435 return nil 436 } 437 } else { 438 m.init(ncap) 439 if !m.match(i, pos) { 440 re.put(m) 441 return nil 442 } 443 } 444 if ncap == 0 { 445 re.put(m) 446 return empty // empty but not nil 447 } 448 cap := make([]int, len(m.matchcap)) 449 copy(cap, m.matchcap) 450 re.put(m) 451 return cap 452 }