github.com/gnolang/gno@v0.0.0-20240520182011-228e9d0192ce/gnovm/stdlibs/regexp/backtrack.gno (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // backtrack is a regular expression search with submatch
     6  // tracking for small regular expressions and texts. It allocates
     7  // a bit vector with (length of input) * (length of prog) bits,
     8  // to make sure it never explores the same (character position, instruction)
     9  // state multiple times. This limits the search to run in time linear in
    10  // the length of the text.
    11  //
    12  // backtrack is a fast replacement for the NFA code on small
    13  // regexps when onepass cannot be used.
    14  
    15  package regexp
    16  
    17  import (
    18  	"regexp/syntax"
    19  )
    20  
// A job is an entry on the backtracker's job stack. It holds
// the instruction pc and the position in the input.
type job struct {
	// pc is the index of the instruction in re.prog.Inst to run.
	pc uint32
	// arg is true when the job resumes an instruction that was already
	// visited (the second half of an InstAlt, or the restore step of an
	// InstCapture); push skips the visited check for such jobs.
	arg bool
	// pos is the input position to run pc at. For an InstCapture resume
	// job it instead carries the saved capture value to restore.
	pos int
}
    28  
// Sizing limits for the backtracker.
const (
	// visitedBits is the number of bits in one element of
	// bitState.visited (a uint32).
	visitedBits        = 32
	maxBacktrackProg   = 500        // len(prog.Inst) <= max
	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
)
    34  
// bitState holds state for the backtracker.
type bitState struct {
	// end is the end position in the input, as passed to reset.
	end int
	// cap holds the capture registers for the path currently being
	// explored; reset initializes every entry to -1.
	cap []int
	// matchcap is a copy of cap taken at the best match found so far.
	matchcap []int
	// jobs is the stack of pending (pc, pos, arg) jobs.
	jobs []job
	// visited is a bit vector with one bit per (pc, pos) pair, indexed
	// by pc*(end+1)+pos; see shouldVisit.
	visited []uint32

	// inputs is initialized by backtrack and cleared by freeBitState.
	// NOTE(review): the inputs type is declared elsewhere in the package.
	inputs inputs
}
    45  
    46  // XXX sync not yet supported.
    47  // var bitStatePool sync.Pool
    48  
    49  func newBitState() *bitState {
    50  	// b, ok := bitStatePool.Get().(*bitState)
    51  	// if !ok {
    52  	b := new(bitState)
    53  	//}
    54  	return b
    55  }
    56  
// freeBitState releases b once a search is finished by clearing its
// inputs. Returning b to a pool is disabled (see the commented-out
// bitStatePool.Put), so b is otherwise left untouched.
func freeBitState(b *bitState) {
	b.inputs.clear()
	// bitStatePool.Put(b)
}
    61  
    62  // maxBitStateLen returns the maximum length of a string to search with
    63  // the backtracker using prog.
    64  func maxBitStateLen(prog *syntax.Prog) int {
    65  	if !shouldBacktrack(prog) {
    66  		return 0
    67  	}
    68  	return maxBacktrackVector / len(prog.Inst)
    69  }
    70  
    71  // shouldBacktrack reports whether the program is too
    72  // long for the backtracker to run.
    73  func shouldBacktrack(prog *syntax.Prog) bool {
    74  	return len(prog.Inst) <= maxBacktrackProg
    75  }
    76  
    77  // reset resets the state of the backtracker.
    78  // end is the end position in the input.
    79  // ncap is the number of captures.
    80  func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) {
    81  	b.end = end
    82  
    83  	if cap(b.jobs) == 0 {
    84  		b.jobs = make([]job, 0, 256)
    85  	} else {
    86  		b.jobs = b.jobs[:0]
    87  	}
    88  
    89  	visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
    90  	if cap(b.visited) < visitedSize {
    91  		b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
    92  	} else {
    93  		b.visited = b.visited[:visitedSize]
    94  		for i := range b.visited {
    95  			b.visited[i] = 0
    96  		}
    97  	}
    98  
    99  	if cap(b.cap) < ncap {
   100  		b.cap = make([]int, ncap)
   101  	} else {
   102  		b.cap = b.cap[:ncap]
   103  	}
   104  	for i := range b.cap {
   105  		b.cap[i] = -1
   106  	}
   107  
   108  	if cap(b.matchcap) < ncap {
   109  		b.matchcap = make([]int, ncap)
   110  	} else {
   111  		b.matchcap = b.matchcap[:ncap]
   112  	}
   113  	for i := range b.matchcap {
   114  		b.matchcap[i] = -1
   115  	}
   116  }
   117  
   118  // shouldVisit reports whether the combination of (pc, pos) has not
   119  // been visited yet.
   120  func (b *bitState) shouldVisit(pc uint32, pos int) bool {
   121  	n := uint(int(pc)*(b.end+1) + pos)
   122  	if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
   123  		return false
   124  	}
   125  	b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
   126  	return true
   127  }
   128  
   129  // push pushes (pc, pos, arg) onto the job stack if it should be
   130  // visited.
   131  func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) {
   132  	// Only check shouldVisit when arg is false.
   133  	// When arg is true, we are continuing a previous visit.
   134  	if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) {
   135  		b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
   136  	}
   137  }
   138  
// tryBacktrack runs a backtracking search starting at pos.
//
// b is the state prepared by reset, i is the input to read from, and pc
// is the instruction to start at. It reports whether a match was found.
// When captures were requested (len(b.cap) > 0), the capture positions of
// the reported match are left in b.matchcap. With re.longest set, the
// search keeps exploring after a match in the hope of a longer one.
//
// The visited bits in b are not cleared here, so states explored by an
// earlier call on the same b are not explored again.
func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
	longest := re.longest

	b.push(re, pc, pos, false)
	for len(b.jobs) > 0 {
		l := len(b.jobs) - 1
		// Pop job off the stack.
		pc := b.jobs[l].pc
		pos := b.jobs[l].pos
		arg := b.jobs[l].arg
		b.jobs = b.jobs[:l]

		// Optimization: rather than push and pop,
		// code that is going to push and continue
		// the loop simply updates pc, pos, and arg
		// and jumps to CheckAndLoop. We have to
		// do the shouldVisit check that push
		// would have, but we avoid the stack
		// manipulation.
		//
		// A job that was just popped has already passed that check
		// (or has arg set), so it goes straight to Skip.
		goto Skip
	CheckAndLoop:
		if !b.shouldVisit(pc, pos) {
			continue
		}
	Skip:
		inst := re.prog.Inst[pc]

		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstFail:
			// push never queues InstFail, so reaching one here is a bug.
			panic("unexpected InstFail")
		case syntax.InstAlt:
			// Cannot just
			//   b.push(re, inst.Out, pos, false)
			//   b.push(re, inst.Arg, pos, false)
			// If during the processing of inst.Out, we encounter
			// inst.Arg via another path, we want to process it then.
			// Pushing it here will inhibit that. Instead, re-push
			// inst with arg==true as a reminder to push inst.Arg out
			// later.
			if arg {
				// Finished inst.Out; try inst.Arg.
				arg = false
				pc = inst.Arg
				goto CheckAndLoop
			} else {
				b.push(re, pc, pos, true)
				pc = inst.Out
				goto CheckAndLoop
			}

		case syntax.InstAltMatch:
			// One opcode consumes runes; the other leads to match.
			switch re.prog.Inst[inst.Out].Op {
			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
				// inst.Arg is the match.
				b.push(re, inst.Arg, pos, false)
				pc = inst.Arg
				pos = b.end
				goto CheckAndLoop
			}
			// inst.Out is the match - non-greedy
			b.push(re, inst.Out, b.end, false)
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRune:
			// Read at pos; abandon this path unless the rune matches.
			r, width := i.step(pos)
			if !inst.MatchRune(r) {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRune1:
			// Single-rune instruction: compare against inst.Rune[0] directly.
			r, width := i.step(pos)
			if r != inst.Rune[0] {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRuneAnyNotNL:
			// Any rune except newline; end of text does not match either.
			r, width := i.step(pos)
			if r == '\n' || r == endOfText {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRuneAny:
			// Any rune; only end of text fails.
			r, width := i.step(pos)
			if r == endOfText {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstCapture:
			if arg {
				// Finished inst.Out; restore the old value.
				// For this resume job, pos carries the saved register
				// value rather than an input position.
				b.cap[inst.Arg] = pos
				continue
			} else {
				// Registers beyond len(b.cap) were not requested; skip
				// recording them but keep following inst.Out.
				if inst.Arg < uint32(len(b.cap)) {
					// Capture pos to register, but save old value.
					b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done.
					b.cap[inst.Arg] = pos
				}
				pc = inst.Out
				goto CheckAndLoop
			}

		case syntax.InstEmptyWidth:
			// Zero-width assertion: consumes nothing, only checks the
			// context flags at pos against the ops in inst.Arg.
			flag := i.context(pos)
			if !flag.match(syntax.EmptyOp(inst.Arg)) {
				continue
			}
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstNop:
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstMatch:
			// We found a match. If the caller doesn't care
			// where the match is, no point going further.
			if len(b.cap) == 0 {
				return true
			}

			// Record best match so far.
			// Only need to check end point, because this entire
			// call is only considering one start position.
			if len(b.cap) > 1 {
				b.cap[1] = pos
			}
			// Keep the first match found, or in longest mode any match
			// that ends later than the one already recorded.
			if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) {
				copy(b.matchcap, b.cap)
			}

			// If going for first match, we're done.
			if !longest {
				return true
			}

			// If we used the entire text, no longer match is possible.
			if pos == b.end {
				return true
			}

			// Otherwise, continue on in hope of a longer match.
			continue
		}
	}

	// The job stack is exhausted. In longest mode a match may have been
	// recorded in matchcap on the way; otherwise nothing matched.
	return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0
}
   304  
   305  // backtrack runs a backtracking search of prog on the input starting at pos.
   306  func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int {
   307  	startCond := re.cond
   308  	if startCond == ^syntax.EmptyOp(0) { // impossible
   309  		return nil
   310  	}
   311  	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
   312  		// Anchored match, past beginning of text.
   313  		return nil
   314  	}
   315  
   316  	b := newBitState()
   317  	i, end := b.inputs.init(nil, ib, is)
   318  	b.reset(re.prog, end, ncap)
   319  
   320  	// Anchored search must start at the beginning of the input
   321  	if startCond&syntax.EmptyBeginText != 0 {
   322  		if len(b.cap) > 0 {
   323  			b.cap[0] = pos
   324  		}
   325  		if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
   326  			freeBitState(b)
   327  			return nil
   328  		}
   329  	} else {
   330  		// Unanchored search, starting from each possible text position.
   331  		// Notice that we have to try the empty string at the end of
   332  		// the text, so the loop condition is pos <= end, not pos < end.
   333  		// This looks like it's quadratic in the size of the text,
   334  		// but we are not clearing visited between calls to TrySearch,
   335  		// so no work is duplicated and it ends up still being linear.
   336  		width := -1
   337  		for ; pos <= end && width != 0; pos += width {
   338  			if len(re.prefix) > 0 {
   339  				// Match requires literal prefix; fast search for it.
   340  				advance := i.index(re, pos)
   341  				if advance < 0 {
   342  					freeBitState(b)
   343  					return nil
   344  				}
   345  				pos += advance
   346  			}
   347  
   348  			if len(b.cap) > 0 {
   349  				b.cap[0] = pos
   350  			}
   351  			if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
   352  				// Match must be leftmost; done.
   353  				goto Match
   354  			}
   355  			_, width = i.step(pos)
   356  		}
   357  		freeBitState(b)
   358  		return nil
   359  	}
   360  
   361  Match:
   362  	dstCap = append(dstCap, b.matchcap...)
   363  	freeBitState(b)
   364  	return dstCap
   365  }