rsc.io/go@v0.0.0-20150416155037-e040fd465409/src/regexp/backtrack.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // backtrack is a regular expression search with submatch 6 // tracking for small regular expressions and texts. It allocates 7 // a bit vector with (length of input) * (length of prog) bits, 8 // to make sure it never explores the same (character position, instruction) 9 // state multiple times. This limits the search to run in time linear in 10 // the length of the test. 11 // 12 // backtrack is a fast replacement for the NFA code on small 13 // regexps when onepass cannot be used. 14 15 package regexp 16 17 import "regexp/syntax" 18 19 // A job is an entry on the backtracker's job stack. It holds 20 // the instruction pc and the position in the input. 21 type job struct { 22 pc uint32 23 arg int 24 pos int 25 } 26 27 const ( 28 visitedBits = 32 29 maxBacktrackProg = 500 // len(prog.Inst) <= max 30 maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits) 31 ) 32 33 // bitState holds state for the backtracker. 34 type bitState struct { 35 prog *syntax.Prog 36 37 end int 38 cap []int 39 reqcap bool // whether any captures are requested 40 input input 41 jobs []job 42 visited []uint32 43 } 44 45 var notBacktrack *bitState = nil 46 47 // maxBitStateLen returns the maximum length of a string to search with 48 // the backtracker using prog. 49 func maxBitStateLen(prog *syntax.Prog) int { 50 if !shouldBacktrack(prog) { 51 return 0 52 } 53 return maxBacktrackVector / len(prog.Inst) 54 } 55 56 // newBitState returns a new bitState for the given prog, 57 // or notBacktrack if the size of the prog exceeds the maximum size that 58 // the backtracker will be run for. 59 func newBitState(prog *syntax.Prog) *bitState { 60 if !shouldBacktrack(prog) { 61 return notBacktrack 62 } 63 return &bitState{ 64 prog: prog, 65 } 66 } 67 68 // shouldBacktrack reports whether the program is too 69 // long for the backtracker to run. 70 func shouldBacktrack(prog *syntax.Prog) bool { 71 return len(prog.Inst) <= maxBacktrackProg 72 } 73 74 // reset resets the state of the backtracker. 75 // end is the end position in the input. ncap and reqcap are the number 76 // of the machine's capture registers and the number of user-requested 77 // captures respectively. 78 func (b *bitState) reset(end int, ncap int, reqcap int) { 79 b.end = end 80 b.reqcap = reqcap > 0 81 82 if cap(b.jobs) == 0 { 83 b.jobs = make([]job, 0, 256) 84 } else { 85 b.jobs = b.jobs[:0] 86 } 87 88 visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits 89 if cap(b.visited) < visitedSize { 90 b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits) 91 } else { 92 b.visited = b.visited[:visitedSize] 93 for i := range b.visited { 94 b.visited[i] = 0 95 } 96 } 97 98 if len(b.cap) < ncap { 99 b.cap = make([]int, ncap) 100 } 101 for i := range b.cap { 102 b.cap[i] = -1 103 } 104 } 105 106 // shouldVisit reports whether the combination of (pc, pos) has not 107 // been visited yet. 108 func (b *bitState) shouldVisit(pc uint32, pos int) bool { 109 n := uint(int(pc)*(b.end+1) + pos) 110 if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 { 111 return false 112 } 113 b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1)) 114 return true 115 } 116 117 // push pushes (pc, pos, arg) onto the job stack if it should be 118 // visited. 119 func (b *bitState) push(pc uint32, pos int, arg int) { 120 if b.prog.Inst[pc].Op == syntax.InstFail { 121 return 122 } 123 124 // Only check shouldVisit when arg == 0. 125 // When arg > 0, we are continuing a previous visit. 126 if arg == 0 && !b.shouldVisit(pc, pos) { 127 return 128 } 129 130 b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos}) 131 } 132 133 // tryBacktrack runs a backtracking search starting at pos. 134 func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool { 135 longest := m.re.longest 136 m.matched = false 137 138 b.push(pc, pos, 0) 139 for len(b.jobs) > 0 { 140 l := len(b.jobs) - 1 141 // Pop job off the stack. 142 pc := b.jobs[l].pc 143 pos := b.jobs[l].pos 144 arg := b.jobs[l].arg 145 b.jobs = b.jobs[:l] 146 147 // Optimization: rather than push and pop, 148 // code that is going to Push and continue 149 // the loop simply updates ip, p, and arg 150 // and jumps to CheckAndLoop. We have to 151 // do the ShouldVisit check that Push 152 // would have, but we avoid the stack 153 // manipulation. 154 goto Skip 155 CheckAndLoop: 156 if !b.shouldVisit(pc, pos) { 157 continue 158 } 159 Skip: 160 161 inst := b.prog.Inst[pc] 162 163 switch inst.Op { 164 default: 165 panic("bad inst") 166 case syntax.InstFail: 167 panic("unexpected InstFail") 168 case syntax.InstAlt: 169 // Cannot just 170 // b.push(inst.Out, pos, 0) 171 // b.push(inst.Arg, pos, 0) 172 // If during the processing of inst.Out, we encounter 173 // inst.Arg via another path, we want to process it then. 174 // Pushing it here will inhibit that. Instead, re-push 175 // inst with arg==1 as a reminder to push inst.Arg out 176 // later. 177 switch arg { 178 case 0: 179 b.push(pc, pos, 1) 180 pc = inst.Out 181 goto CheckAndLoop 182 case 1: 183 // Finished inst.Out; try inst.Arg. 184 arg = 0 185 pc = inst.Arg 186 goto CheckAndLoop 187 } 188 panic("bad arg in InstAlt") 189 190 case syntax.InstAltMatch: 191 // One opcode consumes runes; the other leads to match. 192 switch b.prog.Inst[inst.Out].Op { 193 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 194 // inst.Arg is the match. 195 b.push(inst.Arg, pos, 0) 196 pc = inst.Arg 197 pos = b.end 198 goto CheckAndLoop 199 } 200 // inst.Out is the match - non-greedy 201 b.push(inst.Out, b.end, 0) 202 pc = inst.Out 203 goto CheckAndLoop 204 205 case syntax.InstRune: 206 r, width := i.step(pos) 207 if !inst.MatchRune(r) { 208 continue 209 } 210 pos += width 211 pc = inst.Out 212 goto CheckAndLoop 213 214 case syntax.InstRune1: 215 r, width := i.step(pos) 216 if r != inst.Rune[0] { 217 continue 218 } 219 pos += width 220 pc = inst.Out 221 goto CheckAndLoop 222 223 case syntax.InstRuneAnyNotNL: 224 r, width := i.step(pos) 225 if r == '\n' || r == endOfText { 226 continue 227 } 228 pos += width 229 pc = inst.Out 230 goto CheckAndLoop 231 232 case syntax.InstRuneAny: 233 r, width := i.step(pos) 234 if r == endOfText { 235 continue 236 } 237 pos += width 238 pc = inst.Out 239 goto CheckAndLoop 240 241 case syntax.InstCapture: 242 switch arg { 243 case 0: 244 if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) { 245 // Capture pos to register, but save old value. 246 b.push(pc, b.cap[inst.Arg], 1) // come back when we're done. 247 b.cap[inst.Arg] = pos 248 } 249 pc = inst.Out 250 goto CheckAndLoop 251 case 1: 252 // Finished inst.Out; restore the old value. 253 b.cap[inst.Arg] = pos 254 continue 255 256 } 257 panic("bad arg in InstCapture") 258 continue 259 260 case syntax.InstEmptyWidth: 261 if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 { 262 continue 263 } 264 pc = inst.Out 265 goto CheckAndLoop 266 267 case syntax.InstNop: 268 pc = inst.Out 269 goto CheckAndLoop 270 271 case syntax.InstMatch: 272 // We found a match. If the caller doesn't care 273 // where the match is, no point going further. 274 if !b.reqcap { 275 m.matched = true 276 return m.matched 277 } 278 279 // Record best match so far. 280 // Only need to check end point, because this entire 281 // call is only considering one start position. 282 b.cap[1] = pos 283 if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) { 284 copy(m.matchcap, b.cap) 285 } 286 m.matched = true 287 288 // If going for first match, we're done. 289 if !longest { 290 return m.matched 291 } 292 293 // If we used the entire text, no longer match is possible. 294 if pos == b.end { 295 return m.matched 296 } 297 298 // Otherwise, continue on in hope of a longer match. 299 continue 300 } 301 panic("unreachable") 302 } 303 304 return m.matched 305 } 306 307 // backtrack runs a backtracking search of prog on the input starting at pos. 308 func (m *machine) backtrack(i input, pos int, end int, reqcap int) bool { 309 if !i.canCheckPrefix() { 310 panic("backtrack called for a RuneReader") 311 } 312 313 startCond := m.re.cond 314 if startCond == ^syntax.EmptyOp(0) { // impossible 315 return false 316 } 317 if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 318 // Anchored match, past beginning of text. 319 return false 320 } 321 322 b := m.b 323 b.reset(end, len(m.matchcap), reqcap) 324 325 for i := range m.matchcap { 326 m.matchcap[i] = -1 327 } 328 329 // Anchored search must start at the beginning of the input 330 if startCond&syntax.EmptyBeginText != 0 { 331 b.cap[0] = pos 332 return m.tryBacktrack(b, i, uint32(m.p.Start), pos) 333 } 334 335 // Unanchored search, starting from each possible text position. 336 // Notice that we have to try the empty string at the end of 337 // the text, so the loop condition is pos <= end, not pos < end. 338 // This looks like it's quadratic in the size of the text, 339 // but we are not clearing visited between calls to TrySearch, 340 // so no work is duplicated and it ends up still being linear. 341 width := -1 342 for ; pos <= end && width != 0; pos += width { 343 if len(m.re.prefix) > 0 { 344 // Match requires literal prefix; fast search for it. 345 advance := i.index(m.re, pos) 346 if advance < 0 { 347 return false 348 } 349 pos += advance 350 } 351 352 b.cap[0] = pos 353 if m.tryBacktrack(b, i, uint32(m.p.Start), pos) { 354 // Match must be leftmost; done. 355 return true 356 } 357 _, width = i.step(pos) 358 } 359 return false 360 }