github.com/sbinet/go@v0.0.0-20160827155028-54d7de7dd62b/src/regexp/backtrack.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // backtrack is a regular expression search with submatch 6 // tracking for small regular expressions and texts. It allocates 7 // a bit vector with (length of input) * (length of prog) bits, 8 // to make sure it never explores the same (character position, instruction) 9 // state multiple times. This limits the search to run in time linear in 10 // the length of the test. 11 // 12 // backtrack is a fast replacement for the NFA code on small 13 // regexps when onepass cannot be used. 14 15 package regexp 16 17 import "regexp/syntax" 18 19 // A job is an entry on the backtracker's job stack. It holds 20 // the instruction pc and the position in the input. 21 type job struct { 22 pc uint32 23 arg int 24 pos int 25 } 26 27 const ( 28 visitedBits = 32 29 maxBacktrackProg = 500 // len(prog.Inst) <= max 30 maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits) 31 ) 32 33 // bitState holds state for the backtracker. 34 type bitState struct { 35 prog *syntax.Prog 36 37 end int 38 cap []int 39 jobs []job 40 visited []uint32 41 } 42 43 var notBacktrack *bitState = nil 44 45 // maxBitStateLen returns the maximum length of a string to search with 46 // the backtracker using prog. 47 func maxBitStateLen(prog *syntax.Prog) int { 48 if !shouldBacktrack(prog) { 49 return 0 50 } 51 return maxBacktrackVector / len(prog.Inst) 52 } 53 54 // newBitState returns a new bitState for the given prog, 55 // or notBacktrack if the size of the prog exceeds the maximum size that 56 // the backtracker will be run for. 57 func newBitState(prog *syntax.Prog) *bitState { 58 if !shouldBacktrack(prog) { 59 return notBacktrack 60 } 61 return &bitState{ 62 prog: prog, 63 } 64 } 65 66 // shouldBacktrack reports whether the program is too 67 // long for the backtracker to run. 68 func shouldBacktrack(prog *syntax.Prog) bool { 69 return len(prog.Inst) <= maxBacktrackProg 70 } 71 72 // reset resets the state of the backtracker. 73 // end is the end position in the input. 74 // ncap is the number of captures. 75 func (b *bitState) reset(end int, ncap int) { 76 b.end = end 77 78 if cap(b.jobs) == 0 { 79 b.jobs = make([]job, 0, 256) 80 } else { 81 b.jobs = b.jobs[:0] 82 } 83 84 visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits 85 if cap(b.visited) < visitedSize { 86 b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits) 87 } else { 88 b.visited = b.visited[:visitedSize] 89 for i := range b.visited { 90 b.visited[i] = 0 91 } 92 } 93 94 if cap(b.cap) < ncap { 95 b.cap = make([]int, ncap) 96 } else { 97 b.cap = b.cap[:ncap] 98 } 99 for i := range b.cap { 100 b.cap[i] = -1 101 } 102 } 103 104 // shouldVisit reports whether the combination of (pc, pos) has not 105 // been visited yet. 106 func (b *bitState) shouldVisit(pc uint32, pos int) bool { 107 n := uint(int(pc)*(b.end+1) + pos) 108 if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 { 109 return false 110 } 111 b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1)) 112 return true 113 } 114 115 // push pushes (pc, pos, arg) onto the job stack if it should be 116 // visited. 117 func (b *bitState) push(pc uint32, pos int, arg int) { 118 if b.prog.Inst[pc].Op == syntax.InstFail { 119 return 120 } 121 122 // Only check shouldVisit when arg == 0. 123 // When arg > 0, we are continuing a previous visit. 124 if arg == 0 && !b.shouldVisit(pc, pos) { 125 return 126 } 127 128 b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos}) 129 } 130 131 // tryBacktrack runs a backtracking search starting at pos. 132 func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool { 133 longest := m.re.longest 134 m.matched = false 135 136 b.push(pc, pos, 0) 137 for len(b.jobs) > 0 { 138 l := len(b.jobs) - 1 139 // Pop job off the stack. 140 pc := b.jobs[l].pc 141 pos := b.jobs[l].pos 142 arg := b.jobs[l].arg 143 b.jobs = b.jobs[:l] 144 145 // Optimization: rather than push and pop, 146 // code that is going to Push and continue 147 // the loop simply updates ip, p, and arg 148 // and jumps to CheckAndLoop. We have to 149 // do the ShouldVisit check that Push 150 // would have, but we avoid the stack 151 // manipulation. 152 goto Skip 153 CheckAndLoop: 154 if !b.shouldVisit(pc, pos) { 155 continue 156 } 157 Skip: 158 159 inst := b.prog.Inst[pc] 160 161 switch inst.Op { 162 default: 163 panic("bad inst") 164 case syntax.InstFail: 165 panic("unexpected InstFail") 166 case syntax.InstAlt: 167 // Cannot just 168 // b.push(inst.Out, pos, 0) 169 // b.push(inst.Arg, pos, 0) 170 // If during the processing of inst.Out, we encounter 171 // inst.Arg via another path, we want to process it then. 172 // Pushing it here will inhibit that. Instead, re-push 173 // inst with arg==1 as a reminder to push inst.Arg out 174 // later. 175 switch arg { 176 case 0: 177 b.push(pc, pos, 1) 178 pc = inst.Out 179 goto CheckAndLoop 180 case 1: 181 // Finished inst.Out; try inst.Arg. 182 arg = 0 183 pc = inst.Arg 184 goto CheckAndLoop 185 } 186 panic("bad arg in InstAlt") 187 188 case syntax.InstAltMatch: 189 // One opcode consumes runes; the other leads to match. 190 switch b.prog.Inst[inst.Out].Op { 191 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 192 // inst.Arg is the match. 193 b.push(inst.Arg, pos, 0) 194 pc = inst.Arg 195 pos = b.end 196 goto CheckAndLoop 197 } 198 // inst.Out is the match - non-greedy 199 b.push(inst.Out, b.end, 0) 200 pc = inst.Out 201 goto CheckAndLoop 202 203 case syntax.InstRune: 204 r, width := i.step(pos) 205 if !inst.MatchRune(r) { 206 continue 207 } 208 pos += width 209 pc = inst.Out 210 goto CheckAndLoop 211 212 case syntax.InstRune1: 213 r, width := i.step(pos) 214 if r != inst.Rune[0] { 215 continue 216 } 217 pos += width 218 pc = inst.Out 219 goto CheckAndLoop 220 221 case syntax.InstRuneAnyNotNL: 222 r, width := i.step(pos) 223 if r == '\n' || r == endOfText { 224 continue 225 } 226 pos += width 227 pc = inst.Out 228 goto CheckAndLoop 229 230 case syntax.InstRuneAny: 231 r, width := i.step(pos) 232 if r == endOfText { 233 continue 234 } 235 pos += width 236 pc = inst.Out 237 goto CheckAndLoop 238 239 case syntax.InstCapture: 240 switch arg { 241 case 0: 242 if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) { 243 // Capture pos to register, but save old value. 244 b.push(pc, b.cap[inst.Arg], 1) // come back when we're done. 245 b.cap[inst.Arg] = pos 246 } 247 pc = inst.Out 248 goto CheckAndLoop 249 case 1: 250 // Finished inst.Out; restore the old value. 251 b.cap[inst.Arg] = pos 252 continue 253 254 } 255 panic("bad arg in InstCapture") 256 257 case syntax.InstEmptyWidth: 258 if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 { 259 continue 260 } 261 pc = inst.Out 262 goto CheckAndLoop 263 264 case syntax.InstNop: 265 pc = inst.Out 266 goto CheckAndLoop 267 268 case syntax.InstMatch: 269 // We found a match. If the caller doesn't care 270 // where the match is, no point going further. 271 if len(b.cap) == 0 { 272 m.matched = true 273 return m.matched 274 } 275 276 // Record best match so far. 277 // Only need to check end point, because this entire 278 // call is only considering one start position. 279 if len(b.cap) > 1 { 280 b.cap[1] = pos 281 } 282 if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) { 283 copy(m.matchcap, b.cap) 284 } 285 m.matched = true 286 287 // If going for first match, we're done. 288 if !longest { 289 return m.matched 290 } 291 292 // If we used the entire text, no longer match is possible. 293 if pos == b.end { 294 return m.matched 295 } 296 297 // Otherwise, continue on in hope of a longer match. 298 continue 299 } 300 } 301 302 return m.matched 303 } 304 305 // backtrack runs a backtracking search of prog on the input starting at pos. 306 func (m *machine) backtrack(i input, pos int, end int, ncap int) bool { 307 if !i.canCheckPrefix() { 308 panic("backtrack called for a RuneReader") 309 } 310 311 startCond := m.re.cond 312 if startCond == ^syntax.EmptyOp(0) { // impossible 313 return false 314 } 315 if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 316 // Anchored match, past beginning of text. 317 return false 318 } 319 320 b := m.b 321 b.reset(end, ncap) 322 323 m.matchcap = m.matchcap[:ncap] 324 for i := range m.matchcap { 325 m.matchcap[i] = -1 326 } 327 328 // Anchored search must start at the beginning of the input 329 if startCond&syntax.EmptyBeginText != 0 { 330 if len(b.cap) > 0 { 331 b.cap[0] = pos 332 } 333 return m.tryBacktrack(b, i, uint32(m.p.Start), pos) 334 } 335 336 // Unanchored search, starting from each possible text position. 337 // Notice that we have to try the empty string at the end of 338 // the text, so the loop condition is pos <= end, not pos < end. 339 // This looks like it's quadratic in the size of the text, 340 // but we are not clearing visited between calls to TrySearch, 341 // so no work is duplicated and it ends up still being linear. 342 width := -1 343 for ; pos <= end && width != 0; pos += width { 344 if len(m.re.prefix) > 0 { 345 // Match requires literal prefix; fast search for it. 346 advance := i.index(m.re, pos) 347 if advance < 0 { 348 return false 349 } 350 pos += advance 351 } 352 353 if len(b.cap) > 0 { 354 b.cap[0] = pos 355 } 356 if m.tryBacktrack(b, i, uint32(m.p.Start), pos) { 357 // Match must be leftmost; done. 358 return true 359 } 360 _, width = i.step(pos) 361 } 362 return false 363 }