github.com/mdempsky/go@v0.0.0-20151201204031-5dd372bd1e70/src/regexp/backtrack.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // backtrack is a regular expression search with submatch 6 // tracking for small regular expressions and texts. It allocates 7 // a bit vector with (length of input) * (length of prog) bits, 8 // to make sure it never explores the same (character position, instruction) 9 // state multiple times. This limits the search to run in time linear in 10 // the length of the test. 11 // 12 // backtrack is a fast replacement for the NFA code on small 13 // regexps when onepass cannot be used. 14 15 package regexp 16 17 import "regexp/syntax" 18 19 // A job is an entry on the backtracker's job stack. It holds 20 // the instruction pc and the position in the input. 21 type job struct { 22 pc uint32 23 arg int 24 pos int 25 } 26 27 const ( 28 visitedBits = 32 29 maxBacktrackProg = 500 // len(prog.Inst) <= max 30 maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits) 31 ) 32 33 // bitState holds state for the backtracker. 34 type bitState struct { 35 prog *syntax.Prog 36 37 end int 38 cap []int 39 input input 40 jobs []job 41 visited []uint32 42 } 43 44 var notBacktrack *bitState = nil 45 46 // maxBitStateLen returns the maximum length of a string to search with 47 // the backtracker using prog. 48 func maxBitStateLen(prog *syntax.Prog) int { 49 if !shouldBacktrack(prog) { 50 return 0 51 } 52 return maxBacktrackVector / len(prog.Inst) 53 } 54 55 // newBitState returns a new bitState for the given prog, 56 // or notBacktrack if the size of the prog exceeds the maximum size that 57 // the backtracker will be run for. 58 func newBitState(prog *syntax.Prog) *bitState { 59 if !shouldBacktrack(prog) { 60 return notBacktrack 61 } 62 return &bitState{ 63 prog: prog, 64 } 65 } 66 67 // shouldBacktrack reports whether the program is too 68 // long for the backtracker to run. 69 func shouldBacktrack(prog *syntax.Prog) bool { 70 return len(prog.Inst) <= maxBacktrackProg 71 } 72 73 // reset resets the state of the backtracker. 74 // end is the end position in the input. 75 // ncap is the number of captures. 76 func (b *bitState) reset(end int, ncap int) { 77 b.end = end 78 79 if cap(b.jobs) == 0 { 80 b.jobs = make([]job, 0, 256) 81 } else { 82 b.jobs = b.jobs[:0] 83 } 84 85 visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits 86 if cap(b.visited) < visitedSize { 87 b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits) 88 } else { 89 b.visited = b.visited[:visitedSize] 90 for i := range b.visited { 91 b.visited[i] = 0 92 } 93 } 94 95 if cap(b.cap) < ncap { 96 b.cap = make([]int, ncap) 97 } else { 98 b.cap = b.cap[:ncap] 99 } 100 for i := range b.cap { 101 b.cap[i] = -1 102 } 103 } 104 105 // shouldVisit reports whether the combination of (pc, pos) has not 106 // been visited yet. 107 func (b *bitState) shouldVisit(pc uint32, pos int) bool { 108 n := uint(int(pc)*(b.end+1) + pos) 109 if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 { 110 return false 111 } 112 b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1)) 113 return true 114 } 115 116 // push pushes (pc, pos, arg) onto the job stack if it should be 117 // visited. 118 func (b *bitState) push(pc uint32, pos int, arg int) { 119 if b.prog.Inst[pc].Op == syntax.InstFail { 120 return 121 } 122 123 // Only check shouldVisit when arg == 0. 124 // When arg > 0, we are continuing a previous visit. 125 if arg == 0 && !b.shouldVisit(pc, pos) { 126 return 127 } 128 129 b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos}) 130 } 131 132 // tryBacktrack runs a backtracking search starting at pos. 133 func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool { 134 longest := m.re.longest 135 m.matched = false 136 137 b.push(pc, pos, 0) 138 for len(b.jobs) > 0 { 139 l := len(b.jobs) - 1 140 // Pop job off the stack. 141 pc := b.jobs[l].pc 142 pos := b.jobs[l].pos 143 arg := b.jobs[l].arg 144 b.jobs = b.jobs[:l] 145 146 // Optimization: rather than push and pop, 147 // code that is going to Push and continue 148 // the loop simply updates ip, p, and arg 149 // and jumps to CheckAndLoop. We have to 150 // do the ShouldVisit check that Push 151 // would have, but we avoid the stack 152 // manipulation. 153 goto Skip 154 CheckAndLoop: 155 if !b.shouldVisit(pc, pos) { 156 continue 157 } 158 Skip: 159 160 inst := b.prog.Inst[pc] 161 162 switch inst.Op { 163 default: 164 panic("bad inst") 165 case syntax.InstFail: 166 panic("unexpected InstFail") 167 case syntax.InstAlt: 168 // Cannot just 169 // b.push(inst.Out, pos, 0) 170 // b.push(inst.Arg, pos, 0) 171 // If during the processing of inst.Out, we encounter 172 // inst.Arg via another path, we want to process it then. 173 // Pushing it here will inhibit that. Instead, re-push 174 // inst with arg==1 as a reminder to push inst.Arg out 175 // later. 176 switch arg { 177 case 0: 178 b.push(pc, pos, 1) 179 pc = inst.Out 180 goto CheckAndLoop 181 case 1: 182 // Finished inst.Out; try inst.Arg. 183 arg = 0 184 pc = inst.Arg 185 goto CheckAndLoop 186 } 187 panic("bad arg in InstAlt") 188 189 case syntax.InstAltMatch: 190 // One opcode consumes runes; the other leads to match. 191 switch b.prog.Inst[inst.Out].Op { 192 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 193 // inst.Arg is the match. 194 b.push(inst.Arg, pos, 0) 195 pc = inst.Arg 196 pos = b.end 197 goto CheckAndLoop 198 } 199 // inst.Out is the match - non-greedy 200 b.push(inst.Out, b.end, 0) 201 pc = inst.Out 202 goto CheckAndLoop 203 204 case syntax.InstRune: 205 r, width := i.step(pos) 206 if !inst.MatchRune(r) { 207 continue 208 } 209 pos += width 210 pc = inst.Out 211 goto CheckAndLoop 212 213 case syntax.InstRune1: 214 r, width := i.step(pos) 215 if r != inst.Rune[0] { 216 continue 217 } 218 pos += width 219 pc = inst.Out 220 goto CheckAndLoop 221 222 case syntax.InstRuneAnyNotNL: 223 r, width := i.step(pos) 224 if r == '\n' || r == endOfText { 225 continue 226 } 227 pos += width 228 pc = inst.Out 229 goto CheckAndLoop 230 231 case syntax.InstRuneAny: 232 r, width := i.step(pos) 233 if r == endOfText { 234 continue 235 } 236 pos += width 237 pc = inst.Out 238 goto CheckAndLoop 239 240 case syntax.InstCapture: 241 switch arg { 242 case 0: 243 if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) { 244 // Capture pos to register, but save old value. 245 b.push(pc, b.cap[inst.Arg], 1) // come back when we're done. 246 b.cap[inst.Arg] = pos 247 } 248 pc = inst.Out 249 goto CheckAndLoop 250 case 1: 251 // Finished inst.Out; restore the old value. 252 b.cap[inst.Arg] = pos 253 continue 254 255 } 256 panic("bad arg in InstCapture") 257 continue 258 259 case syntax.InstEmptyWidth: 260 if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 { 261 continue 262 } 263 pc = inst.Out 264 goto CheckAndLoop 265 266 case syntax.InstNop: 267 pc = inst.Out 268 goto CheckAndLoop 269 270 case syntax.InstMatch: 271 // We found a match. If the caller doesn't care 272 // where the match is, no point going further. 273 if len(b.cap) == 0 { 274 m.matched = true 275 return m.matched 276 } 277 278 // Record best match so far. 279 // Only need to check end point, because this entire 280 // call is only considering one start position. 281 if len(b.cap) > 1 { 282 b.cap[1] = pos 283 } 284 if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) { 285 copy(m.matchcap, b.cap) 286 } 287 m.matched = true 288 289 // If going for first match, we're done. 290 if !longest { 291 return m.matched 292 } 293 294 // If we used the entire text, no longer match is possible. 295 if pos == b.end { 296 return m.matched 297 } 298 299 // Otherwise, continue on in hope of a longer match. 300 continue 301 } 302 panic("unreachable") 303 } 304 305 return m.matched 306 } 307 308 // backtrack runs a backtracking search of prog on the input starting at pos. 309 func (m *machine) backtrack(i input, pos int, end int, ncap int) bool { 310 if !i.canCheckPrefix() { 311 panic("backtrack called for a RuneReader") 312 } 313 314 startCond := m.re.cond 315 if startCond == ^syntax.EmptyOp(0) { // impossible 316 return false 317 } 318 if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 319 // Anchored match, past beginning of text. 320 return false 321 } 322 323 b := m.b 324 b.reset(end, ncap) 325 326 m.matchcap = m.matchcap[:ncap] 327 for i := range m.matchcap { 328 m.matchcap[i] = -1 329 } 330 331 // Anchored search must start at the beginning of the input 332 if startCond&syntax.EmptyBeginText != 0 { 333 if len(b.cap) > 0 { 334 b.cap[0] = pos 335 } 336 return m.tryBacktrack(b, i, uint32(m.p.Start), pos) 337 } 338 339 // Unanchored search, starting from each possible text position. 340 // Notice that we have to try the empty string at the end of 341 // the text, so the loop condition is pos <= end, not pos < end. 342 // This looks like it's quadratic in the size of the text, 343 // but we are not clearing visited between calls to TrySearch, 344 // so no work is duplicated and it ends up still being linear. 345 width := -1 346 for ; pos <= end && width != 0; pos += width { 347 if len(m.re.prefix) > 0 { 348 // Match requires literal prefix; fast search for it. 349 advance := i.index(m.re, pos) 350 if advance < 0 { 351 return false 352 } 353 pos += advance 354 } 355 356 if len(b.cap) > 0 { 357 b.cap[0] = pos 358 } 359 if m.tryBacktrack(b, i, uint32(m.p.Start), pos) { 360 // Match must be leftmost; done. 361 return true 362 } 363 _, width = i.step(pos) 364 } 365 return false 366 }