github.com/gnolang/gno@v0.0.0-20240520182011-228e9d0192ce/gnovm/stdlibs/regexp/exec.gno (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package regexp
     6  
     7  import (
     8  	"io"
     9  	"regexp/syntax"
    10  )
    11  
    12  // A queue is a 'sparse array' holding pending threads of execution.
    13  // See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
    14  type queue struct {
    15  	sparse []uint32
    16  	dense  []entry
    17  }
    18  
    19  // An entry is an entry on a queue.
    20  // It holds both the instruction pc and the actual thread.
    21  // Some queue entries are just place holders so that the machine
    22  // knows it has considered that pc. Such entries have t == nil.
    23  type entry struct {
    24  	pc uint32
    25  	t  *thread
    26  }
    27  
    28  // A thread is the state of a single path through the machine:
    29  // an instruction and a corresponding capture array.
    30  // See https://swtch.com/~rsc/regexp/regexp2.html
    31  type thread struct {
    32  	inst *syntax.Inst
    33  	cap  []int
    34  }
    35  
    36  // A machine holds all the state during an NFA simulation for p.
    37  type machine struct {
    38  	re       *Regexp      // corresponding Regexp
    39  	p        *syntax.Prog // compiled program
    40  	q0, q1   queue        // two queues for runq, nextq
    41  	pool     []*thread    // pool of available threads
    42  	matched  bool         // whether a match was found
    43  	matchcap []int        // capture information for the match
    44  
    45  	inputs inputs
    46  }
    47  
    48  type inputs struct {
    49  	// cached inputs, to avoid allocation
    50  	bytes  inputBytes
    51  	string inputString
    52  	reader inputReader
    53  }
    54  
    55  func (i *inputs) newBytes(b []byte) input {
    56  	i.bytes.str = b
    57  	return &i.bytes
    58  }
    59  
    60  func (i *inputs) newString(s string) input {
    61  	i.string.str = s
    62  	return &i.string
    63  }
    64  
    65  func (i *inputs) newReader(r io.RuneReader) input {
    66  	i.reader.r = r
    67  	i.reader.atEOT = false
    68  	i.reader.pos = 0
    69  	return &i.reader
    70  }
    71  
    72  func (i *inputs) clear() {
    73  	// We need to clear 1 of these.
    74  	// Avoid the expense of clearing the others (pointer write barrier).
    75  	if i.bytes.str != nil {
    76  		i.bytes.str = nil
    77  	} else if i.reader.r != nil {
    78  		i.reader.r = nil
    79  	} else {
    80  		i.string.str = ""
    81  	}
    82  }
    83  
    84  func (i *inputs) init(r io.RuneReader, b []byte, s string) (input, int) {
    85  	if r != nil {
    86  		return i.newReader(r), 0
    87  	}
    88  	if b != nil {
    89  		return i.newBytes(b), len(b)
    90  	}
    91  	return i.newString(s), len(s)
    92  }
    93  
    94  func (m *machine) init(ncap int) {
    95  	for _, t := range m.pool {
    96  		t.cap = t.cap[:ncap]
    97  	}
    98  	m.matchcap = m.matchcap[:ncap]
    99  }
   100  
   101  // alloc allocates a new thread with the given instruction.
   102  // It uses the free pool if possible.
   103  func (m *machine) alloc(i *syntax.Inst) *thread {
   104  	var t *thread
   105  	if n := len(m.pool); n > 0 {
   106  		t = m.pool[n-1]
   107  		m.pool = m.pool[:n-1]
   108  	} else {
   109  		t = new(thread)
   110  		t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
   111  	}
   112  	t.inst = i
   113  	return t
   114  }
   115  
   116  // A lazyFlag is a lazily-evaluated syntax.EmptyOp,
   117  // for checking zero-width flags like ^ $ \A \z \B \b.
   118  // It records the pair of relevant runes and does not
   119  // determine the implied flags until absolutely necessary
   120  // (most of the time, that means never).
   121  type lazyFlag uint64
   122  
   123  func newLazyFlag(r1, r2 rune) lazyFlag {
   124  	return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2)))
   125  }
   126  
   127  func (f lazyFlag) match(op syntax.EmptyOp) bool {
   128  	if op == 0 {
   129  		return true
   130  	}
   131  	r1 := rune(f >> 32)
   132  	if op&syntax.EmptyBeginLine != 0 {
   133  		if r1 != '\n' && r1 >= 0 {
   134  			return false
   135  		}
   136  		op &^= syntax.EmptyBeginLine
   137  	}
   138  	if op&syntax.EmptyBeginText != 0 {
   139  		if r1 >= 0 {
   140  			return false
   141  		}
   142  		op &^= syntax.EmptyBeginText
   143  	}
   144  	if op == 0 {
   145  		return true
   146  	}
   147  	r2 := rune(f)
   148  	if op&syntax.EmptyEndLine != 0 {
   149  		if r2 != '\n' && r2 >= 0 {
   150  			return false
   151  		}
   152  		op &^= syntax.EmptyEndLine
   153  	}
   154  	if op&syntax.EmptyEndText != 0 {
   155  		if r2 >= 0 {
   156  			return false
   157  		}
   158  		op &^= syntax.EmptyEndText
   159  	}
   160  	if op == 0 {
   161  		return true
   162  	}
   163  	if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) {
   164  		op &^= syntax.EmptyWordBoundary
   165  	} else {
   166  		op &^= syntax.EmptyNoWordBoundary
   167  	}
   168  	return op == 0
   169  }
   170  
   171  // match runs the machine over the input starting at pos.
   172  // It reports whether a match was found.
   173  // If so, m.matchcap holds the submatch information.
   174  func (m *machine) match(i input, pos int) bool {
   175  	startCond := m.re.cond
   176  	if startCond == ^syntax.EmptyOp(0) { // impossible
   177  		return false
   178  	}
   179  	m.matched = false
   180  	for i := range m.matchcap {
   181  		m.matchcap[i] = -1
   182  	}
   183  	runq, nextq := &m.q0, &m.q1
   184  	r, r1 := endOfText, endOfText
   185  	width, width1 := 0, 0
   186  	r, width = i.step(pos)
   187  	if r != endOfText {
   188  		r1, width1 = i.step(pos + width)
   189  	}
   190  	var flag lazyFlag
   191  	if pos == 0 {
   192  		flag = newLazyFlag(-1, r)
   193  	} else {
   194  		flag = i.context(pos)
   195  	}
   196  	for {
   197  		if len(runq.dense) == 0 {
   198  			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
   199  				// Anchored match, past beginning of text.
   200  				break
   201  			}
   202  			if m.matched {
   203  				// Have match; finished exploring alternatives.
   204  				break
   205  			}
   206  			if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
   207  				// Match requires literal prefix; fast search for it.
   208  				advance := i.index(m.re, pos)
   209  				if advance < 0 {
   210  					break
   211  				}
   212  				pos += advance
   213  				r, width = i.step(pos)
   214  				r1, width1 = i.step(pos + width)
   215  			}
   216  		}
   217  		if !m.matched {
   218  			if len(m.matchcap) > 0 {
   219  				m.matchcap[0] = pos
   220  			}
   221  			m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil)
   222  		}
   223  		flag = newLazyFlag(r, r1)
   224  		m.step(runq, nextq, pos, pos+width, r, &flag)
   225  		if width == 0 {
   226  			break
   227  		}
   228  		if len(m.matchcap) == 0 && m.matched {
   229  			// Found a match and not paying attention
   230  			// to where it is, so any match will do.
   231  			break
   232  		}
   233  		pos += width
   234  		r, width = r1, width1
   235  		if r != endOfText {
   236  			r1, width1 = i.step(pos + width)
   237  		}
   238  		runq, nextq = nextq, runq
   239  	}
   240  	m.clear(nextq)
   241  	return m.matched
   242  }
   243  
   244  // clear frees all threads on the thread queue.
   245  func (m *machine) clear(q *queue) {
   246  	for _, d := range q.dense {
   247  		if d.t != nil {
   248  			m.pool = append(m.pool, d.t)
   249  		}
   250  	}
   251  	q.dense = q.dense[:0]
   252  }
   253  
   254  // step executes one step of the machine, running each of the threads
   255  // on runq and appending new threads to nextq.
   256  // The step processes the rune c (which may be endOfText),
   257  // which starts at position pos and ends at nextPos.
   258  // nextCond gives the setting for the empty-width flags after c.
   259  func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) {
   260  	longest := m.re.longest
   261  	for j := 0; j < len(runq.dense); j++ {
   262  		d := &runq.dense[j]
   263  		t := d.t
   264  		if t == nil {
   265  			continue
   266  		}
   267  		if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
   268  			m.pool = append(m.pool, t)
   269  			continue
   270  		}
   271  		i := t.inst
   272  		add := false
   273  		switch i.Op {
   274  		default:
   275  			panic("bad inst")
   276  
   277  		case syntax.InstMatch:
   278  			if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
   279  				t.cap[1] = pos
   280  				copy(m.matchcap, t.cap)
   281  			}
   282  			if !longest {
   283  				// First-match mode: cut off all lower-priority threads.
   284  				for _, d := range runq.dense[j+1:] {
   285  					if d.t != nil {
   286  						m.pool = append(m.pool, d.t)
   287  					}
   288  				}
   289  				runq.dense = runq.dense[:0]
   290  			}
   291  			m.matched = true
   292  
   293  		case syntax.InstRune:
   294  			add = i.MatchRune(c)
   295  		case syntax.InstRune1:
   296  			add = c == i.Rune[0]
   297  		case syntax.InstRuneAny:
   298  			add = true
   299  		case syntax.InstRuneAnyNotNL:
   300  			add = c != '\n'
   301  		}
   302  		if add {
   303  			t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
   304  		}
   305  		if t != nil {
   306  			m.pool = append(m.pool, t)
   307  		}
   308  	}
   309  	runq.dense = runq.dense[:0]
   310  }
   311  
   312  // add adds an entry to q for pc, unless the q already has such an entry.
   313  // It also recursively adds an entry for all instructions reachable from pc by following
   314  // empty-width conditions satisfied by cond.  pos gives the current position
   315  // in the input.
   316  func (m *machine) add(q *queue, pc uint32, pos int, cap_ []int, cond *lazyFlag, t *thread) *thread {
   317  Again:
   318  	if pc == 0 {
   319  		return t
   320  	}
   321  	if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
   322  		return t
   323  	}
   324  
   325  	j := len(q.dense)
   326  	q.dense = q.dense[:j+1]
   327  	d := &q.dense[j]
   328  	d.t = nil
   329  	d.pc = pc
   330  	q.sparse[pc] = uint32(j)
   331  
   332  	i := &m.p.Inst[pc]
   333  	switch i.Op {
   334  	default:
   335  		panic("unhandled")
   336  	case syntax.InstFail:
   337  		// nothing
   338  	case syntax.InstAlt, syntax.InstAltMatch:
   339  		t = m.add(q, i.Out, pos, cap_, cond, t)
   340  		pc = i.Arg
   341  		goto Again
   342  	case syntax.InstEmptyWidth:
   343  		if cond.match(syntax.EmptyOp(i.Arg)) {
   344  			pc = i.Out
   345  			goto Again
   346  		}
   347  	case syntax.InstNop:
   348  		pc = i.Out
   349  		goto Again
   350  	case syntax.InstCapture:
   351  		if int(i.Arg) < len(cap_) {
   352  			opos := cap_[i.Arg]
   353  			cap_[i.Arg] = pos
   354  			m.add(q, i.Out, pos, cap_, cond, nil)
   355  			cap_[i.Arg] = opos
   356  		} else {
   357  			pc = i.Out
   358  			goto Again
   359  		}
   360  	case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   361  		if t == nil {
   362  			t = m.alloc(i)
   363  		} else {
   364  			t.inst = i
   365  		}
   366  		if len(cap_) > 0 && &t.cap[0] != &cap_[0] {
   367  			copy(t.cap, cap_)
   368  		}
   369  		d.t = t
   370  		t = nil
   371  	}
   372  	return t
   373  }
   374  
   375  type onePassMachine struct {
   376  	inputs   inputs
   377  	matchcap []int
   378  }
   379  
   380  // XXX sync not yet supported
   381  // var onePassPool sync.Pool
   382  
   383  func newOnePassMachine() *onePassMachine {
   384  	// m, ok := onePassPool.Get().(*onePassMachine)
   385  	// if !ok {
   386  	m := new(onePassMachine)
   387  	//}
   388  	return m
   389  }
   390  
   391  func freeOnePassMachine(m *onePassMachine) {
   392  	m.inputs.clear()
   393  	// onePassPool.Put(m)
   394  }
   395  
   396  // doOnePass implements r.doExecute using the one-pass execution engine.
   397  func (re *Regexp) doOnePass(ir io.RuneReader, ib []byte, is string, pos, ncap int, dstCap []int) []int {
   398  	startCond := re.cond
   399  	if startCond == ^syntax.EmptyOp(0) { // impossible
   400  		return nil
   401  	}
   402  
   403  	m := newOnePassMachine()
   404  	if cap(m.matchcap) < ncap {
   405  		m.matchcap = make([]int, ncap)
   406  	} else {
   407  		m.matchcap = m.matchcap[:ncap]
   408  	}
   409  
   410  	matched := false
   411  	for i := range m.matchcap {
   412  		m.matchcap[i] = -1
   413  	}
   414  
   415  	i, _ := m.inputs.init(ir, ib, is)
   416  
   417  	r, r1 := endOfText, endOfText
   418  	width, width1 := 0, 0
   419  	r, width = i.step(pos)
   420  	if r != endOfText {
   421  		r1, width1 = i.step(pos + width)
   422  	}
   423  	var flag lazyFlag
   424  	if pos == 0 {
   425  		flag = newLazyFlag(-1, r)
   426  	} else {
   427  		flag = i.context(pos)
   428  	}
   429  	pc := re.onepass.Start
   430  	inst := re.onepass.Inst[pc]
   431  	// If there is a simple literal prefix, skip over it.
   432  	if pos == 0 && flag.match(syntax.EmptyOp(inst.Arg)) &&
   433  		len(re.prefix) > 0 && i.canCheckPrefix() {
   434  		// Match requires literal prefix; fast search for it.
   435  		if !i.hasPrefix(re) {
   436  			goto Return
   437  		}
   438  		pos += len(re.prefix)
   439  		r, width = i.step(pos)
   440  		r1, width1 = i.step(pos + width)
   441  		flag = i.context(pos)
   442  		pc = int(re.prefixEnd)
   443  	}
   444  	for {
   445  		inst = re.onepass.Inst[pc]
   446  		pc = int(inst.Out)
   447  		switch inst.Op {
   448  		default:
   449  			panic("bad inst")
   450  		case syntax.InstMatch:
   451  			matched = true
   452  			if len(m.matchcap) > 0 {
   453  				m.matchcap[0] = 0
   454  				m.matchcap[1] = pos
   455  			}
   456  			goto Return
   457  		case syntax.InstRune:
   458  			if !inst.MatchRune(r) {
   459  				goto Return
   460  			}
   461  		case syntax.InstRune1:
   462  			if r != inst.Rune[0] {
   463  				goto Return
   464  			}
   465  		case syntax.InstRuneAny:
   466  			// Nothing
   467  		case syntax.InstRuneAnyNotNL:
   468  			if r == '\n' {
   469  				goto Return
   470  			}
   471  		// peek at the input rune to see which branch of the Alt to take
   472  		case syntax.InstAlt, syntax.InstAltMatch:
   473  			pc = int(onePassNext(&inst, r))
   474  			continue
   475  		case syntax.InstFail:
   476  			goto Return
   477  		case syntax.InstNop:
   478  			continue
   479  		case syntax.InstEmptyWidth:
   480  			if !flag.match(syntax.EmptyOp(inst.Arg)) {
   481  				goto Return
   482  			}
   483  			continue
   484  		case syntax.InstCapture:
   485  			if int(inst.Arg) < len(m.matchcap) {
   486  				m.matchcap[inst.Arg] = pos
   487  			}
   488  			continue
   489  		}
   490  		if width == 0 {
   491  			break
   492  		}
   493  		flag = newLazyFlag(r, r1)
   494  		pos += width
   495  		r, width = r1, width1
   496  		if r != endOfText {
   497  			r1, width1 = i.step(pos + width)
   498  		}
   499  	}
   500  
   501  Return:
   502  	if !matched {
   503  		freeOnePassMachine(m)
   504  		return nil
   505  	}
   506  
   507  	dstCap = append(dstCap, m.matchcap...)
   508  	freeOnePassMachine(m)
   509  	return dstCap
   510  }
   511  
   512  // doMatch reports whether either r, b or s match the regexp.
   513  func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool {
   514  	return re.doExecute(r, b, s, 0, 0, nil) != nil
   515  }
   516  
   517  // doExecute finds the leftmost match in the input, appends the position
   518  // of its subexpressions to dstCap and returns dstCap.
   519  //
   520  // nil is returned if no matches are found and non-nil if matches are found.
   521  func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int {
   522  	if dstCap == nil {
   523  		// Make sure 'return dstCap' is non-nil.
   524  		dstCap = arrayNoInts[:0:0]
   525  	}
   526  
   527  	if r == nil && len(b)+len(s) < re.minInputLen {
   528  		return nil
   529  	}
   530  
   531  	if re.onepass != nil {
   532  		return re.doOnePass(r, b, s, pos, ncap, dstCap)
   533  	}
   534  	if r == nil && len(b)+len(s) < re.maxBitStateLen {
   535  		return re.backtrack(b, s, pos, ncap, dstCap)
   536  	}
   537  
   538  	m := re.get()
   539  	i, _ := m.inputs.init(r, b, s)
   540  
   541  	m.init(ncap)
   542  	if !m.match(i, pos) {
   543  		re.put(m)
   544  		return nil
   545  	}
   546  
   547  	dstCap = append(dstCap, m.matchcap...)
   548  	re.put(m)
   549  	return dstCap
   550  }
   551  
   552  // arrayNoInts is returned by doExecute match if nil dstCap is passed
   553  // to it with ncap=0.
   554  var arrayNoInts [0]int