github.com/peggyl/go@v0.0.0-20151008231540-ae315999c2d5/src/regexp/onepass.go (about)

     1  // Copyright 2014 The Go Authors.  All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package regexp
     6  
     7  import (
     8  	"bytes"
     9  	"regexp/syntax"
    10  	"sort"
    11  	"unicode"
    12  )
    13  
    14  // "One-pass" regexp execution.
    15  // Some regexps can be analyzed to determine that they never need
    16  // backtracking: they are guaranteed to run in one pass over the string
    17  // without bothering to save all the usual NFA state.
    18  // Detect those and execute them more quickly.
    19  
    20  // A onePassProg is a compiled one-pass regular expression program.
    21  // It is the same as syntax.Prog except for the use of onePassInst.
    22  type onePassProg struct {
    23  	Inst   []onePassInst
    24  	Start  int // index of start instruction
    25  	NumCap int // number of InstCapture insts in re
    26  }
    27  
    28  // A onePassInst is a single instruction in a one-pass regular expression program.
    29  // It is the same as syntax.Inst except for the new 'Next' field.
    30  type onePassInst struct {
    31  	syntax.Inst
    32  	Next []uint32
    33  }
    34  
    35  // OnePassPrefix returns a literal string that all matches for the
    36  // regexp must start with.  Complete is true if the prefix
    37  // is the entire match. Pc is the index of the last rune instruction
    38  // in the string. The OnePassPrefix skips over the mandatory
    39  // EmptyBeginText
    40  func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
    41  	i := &p.Inst[p.Start]
    42  	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
    43  		return "", i.Op == syntax.InstMatch, uint32(p.Start)
    44  	}
    45  	pc = i.Out
    46  	i = &p.Inst[pc]
    47  	for i.Op == syntax.InstNop {
    48  		pc = i.Out
    49  		i = &p.Inst[pc]
    50  	}
    51  	// Avoid allocation of buffer if prefix is empty.
    52  	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
    53  		return "", i.Op == syntax.InstMatch, uint32(p.Start)
    54  	}
    55  
    56  	// Have prefix; gather characters.
    57  	var buf bytes.Buffer
    58  	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
    59  		buf.WriteRune(i.Rune[0])
    60  		pc, i = i.Out, &p.Inst[i.Out]
    61  	}
    62  	return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc
    63  }
    64  
    65  // OnePassNext selects the next actionable state of the prog, based on the input character.
    66  // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
    67  // One of the alternates may ultimately lead without input to end of line. If the instruction
    68  // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
    69  func onePassNext(i *onePassInst, r rune) uint32 {
    70  	next := i.MatchRunePos(r)
    71  	if next >= 0 {
    72  		return i.Next[next]
    73  	}
    74  	if i.Op == syntax.InstAltMatch {
    75  		return i.Out
    76  	}
    77  	return 0
    78  }
    79  
    80  func iop(i *syntax.Inst) syntax.InstOp {
    81  	op := i.Op
    82  	switch op {
    83  	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    84  		op = syntax.InstRune
    85  	}
    86  	return op
    87  }
    88  
    89  // Sparse Array implementation is used as a queueOnePass.
    90  type queueOnePass struct {
    91  	sparse          []uint32
    92  	dense           []uint32
    93  	size, nextIndex uint32
    94  }
    95  
    96  func (q *queueOnePass) empty() bool {
    97  	return q.nextIndex >= q.size
    98  }
    99  
   100  func (q *queueOnePass) next() (n uint32) {
   101  	n = q.dense[q.nextIndex]
   102  	q.nextIndex++
   103  	return
   104  }
   105  
   106  func (q *queueOnePass) clear() {
   107  	q.size = 0
   108  	q.nextIndex = 0
   109  }
   110  
   111  func (q *queueOnePass) reset() {
   112  	q.nextIndex = 0
   113  }
   114  
   115  func (q *queueOnePass) contains(u uint32) bool {
   116  	if u >= uint32(len(q.sparse)) {
   117  		return false
   118  	}
   119  	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
   120  }
   121  
   122  func (q *queueOnePass) insert(u uint32) {
   123  	if !q.contains(u) {
   124  		q.insertNew(u)
   125  	}
   126  }
   127  
   128  func (q *queueOnePass) insertNew(u uint32) {
   129  	if u >= uint32(len(q.sparse)) {
   130  		return
   131  	}
   132  	q.sparse[u] = q.size
   133  	q.dense[q.size] = u
   134  	q.size++
   135  }
   136  
   137  func newQueue(size int) (q *queueOnePass) {
   138  	return &queueOnePass{
   139  		sparse: make([]uint32, size),
   140  		dense:  make([]uint32, size),
   141  	}
   142  }
   143  
   144  // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
   145  // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
   146  // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
   147  // NextIp array with the single element mergeFailed is returned.
   148  // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
   149  const mergeFailed = uint32(0xffffffff)
   150  
   151  var (
   152  	noRune = []rune{}
   153  	noNext = []uint32{mergeFailed}
   154  )
   155  
   156  func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
   157  	leftLen := len(*leftRunes)
   158  	rightLen := len(*rightRunes)
   159  	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
   160  		panic("mergeRuneSets odd length []rune")
   161  	}
   162  	var (
   163  		lx, rx int
   164  	)
   165  	merged := make([]rune, 0)
   166  	next := make([]uint32, 0)
   167  	ok := true
   168  	defer func() {
   169  		if !ok {
   170  			merged = nil
   171  			next = nil
   172  		}
   173  	}()
   174  
   175  	ix := -1
   176  	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
   177  		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
   178  			return false
   179  		}
   180  		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
   181  		*newLow += 2
   182  		ix += 2
   183  		next = append(next, pc)
   184  		return true
   185  	}
   186  
   187  	for lx < leftLen || rx < rightLen {
   188  		switch {
   189  		case rx >= rightLen:
   190  			ok = extend(&lx, leftRunes, leftPC)
   191  		case lx >= leftLen:
   192  			ok = extend(&rx, rightRunes, rightPC)
   193  		case (*rightRunes)[rx] < (*leftRunes)[lx]:
   194  			ok = extend(&rx, rightRunes, rightPC)
   195  		default:
   196  			ok = extend(&lx, leftRunes, leftPC)
   197  		}
   198  		if !ok {
   199  			return noRune, noNext
   200  		}
   201  	}
   202  	return merged, next
   203  }
   204  
   205  // cleanupOnePass drops working memory, and restores certain shortcut instructions.
   206  func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
   207  	for ix, instOriginal := range original.Inst {
   208  		switch instOriginal.Op {
   209  		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
   210  		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
   211  			prog.Inst[ix].Next = nil
   212  		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   213  			prog.Inst[ix].Next = nil
   214  			prog.Inst[ix] = onePassInst{Inst: instOriginal}
   215  		}
   216  	}
   217  }
   218  
   219  // onePassCopy creates a copy of the original Prog, as we'll be modifying it
   220  func onePassCopy(prog *syntax.Prog) *onePassProg {
   221  	p := &onePassProg{
   222  		Start:  prog.Start,
   223  		NumCap: prog.NumCap,
   224  	}
   225  	for _, inst := range prog.Inst {
   226  		p.Inst = append(p.Inst, onePassInst{Inst: inst})
   227  	}
   228  
   229  	// rewrites one or more common Prog constructs that enable some otherwise
   230  	// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
   231  	// ip A, that points to ips B & C.
   232  	// A:BC + B:DA => A:BC + B:CD
   233  	// A:BC + B:DC => A:DC + B:DC
   234  	for pc := range p.Inst {
   235  		switch p.Inst[pc].Op {
   236  		default:
   237  			continue
   238  		case syntax.InstAlt, syntax.InstAltMatch:
   239  			// A:Bx + B:Ay
   240  			p_A_Other := &p.Inst[pc].Out
   241  			p_A_Alt := &p.Inst[pc].Arg
   242  			// make sure a target is another Alt
   243  			instAlt := p.Inst[*p_A_Alt]
   244  			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   245  				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
   246  				instAlt = p.Inst[*p_A_Alt]
   247  				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   248  					continue
   249  				}
   250  			}
   251  			instOther := p.Inst[*p_A_Other]
   252  			// Analyzing both legs pointing to Alts is for another day
   253  			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
   254  				// too complicated
   255  				continue
   256  			}
   257  			// simple empty transition loop
   258  			// A:BC + B:DA => A:BC + B:DC
   259  			p_B_Alt := &p.Inst[*p_A_Alt].Out
   260  			p_B_Other := &p.Inst[*p_A_Alt].Arg
   261  			patch := false
   262  			if instAlt.Out == uint32(pc) {
   263  				patch = true
   264  			} else if instAlt.Arg == uint32(pc) {
   265  				patch = true
   266  				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
   267  			}
   268  			if patch {
   269  				*p_B_Alt = *p_A_Other
   270  			}
   271  
   272  			// empty transition to common target
   273  			// A:BC + B:DC => A:DC + B:DC
   274  			if *p_A_Other == *p_B_Alt {
   275  				*p_A_Alt = *p_B_Other
   276  			}
   277  		}
   278  	}
   279  	return p
   280  }
   281  
   282  // runeSlice exists to permit sorting the case-folded rune sets.
   283  type runeSlice []rune
   284  
   285  func (p runeSlice) Len() int           { return len(p) }
   286  func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
   287  func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
   288  
   289  // Sort is a convenience method.
   290  func (p runeSlice) Sort() {
   291  	sort.Sort(p)
   292  }
   293  
   294  var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
   295  var anyRune = []rune{0, unicode.MaxRune}
   296  
   297  // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
   298  // the match engine can always tell which branch to take. The routine may modify
   299  // p if it is turned into a onepass Prog. If it isn't possible for this to be a
   300  // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
   301  // to the size of the Prog.
   302  func makeOnePass(p *onePassProg) *onePassProg {
   303  	// If the machine is very long, it's not worth the time to check if we can use one pass.
   304  	if len(p.Inst) >= 1000 {
   305  		return notOnePass
   306  	}
   307  
   308  	var (
   309  		instQueue    = newQueue(len(p.Inst))
   310  		visitQueue   = newQueue(len(p.Inst))
   311  		build        func(uint32, *queueOnePass)
   312  		check        func(uint32, map[uint32]bool) bool
   313  		onePassRunes = make([][]rune, len(p.Inst))
   314  	)
   315  	build = func(pc uint32, q *queueOnePass) {
   316  		if q.contains(pc) {
   317  			return
   318  		}
   319  		inst := p.Inst[pc]
   320  		switch inst.Op {
   321  		case syntax.InstAlt, syntax.InstAltMatch:
   322  			q.insert(inst.Out)
   323  			build(inst.Out, q)
   324  			q.insert(inst.Arg)
   325  		case syntax.InstMatch, syntax.InstFail:
   326  		default:
   327  			q.insert(inst.Out)
   328  		}
   329  	}
   330  
   331  	// check that paths from Alt instructions are unambiguous, and rebuild the new
   332  	// program as a onepass program
   333  	check = func(pc uint32, m map[uint32]bool) (ok bool) {
   334  		ok = true
   335  		inst := &p.Inst[pc]
   336  		if visitQueue.contains(pc) {
   337  			return
   338  		}
   339  		visitQueue.insert(pc)
   340  		switch inst.Op {
   341  		case syntax.InstAlt, syntax.InstAltMatch:
   342  			ok = check(inst.Out, m) && check(inst.Arg, m)
   343  			// check no-input paths to InstMatch
   344  			matchOut := m[inst.Out]
   345  			matchArg := m[inst.Arg]
   346  			if matchOut && matchArg {
   347  				ok = false
   348  				break
   349  			}
   350  			// Match on empty goes in inst.Out
   351  			if matchArg {
   352  				inst.Out, inst.Arg = inst.Arg, inst.Out
   353  				matchOut, matchArg = matchArg, matchOut
   354  			}
   355  			if matchOut {
   356  				m[pc] = true
   357  				inst.Op = syntax.InstAltMatch
   358  			}
   359  
   360  			// build a dispatch operator from the two legs of the alt.
   361  			onePassRunes[pc], inst.Next = mergeRuneSets(
   362  				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
   363  			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
   364  				ok = false
   365  				break
   366  			}
   367  		case syntax.InstCapture, syntax.InstNop:
   368  			ok = check(inst.Out, m)
   369  			m[pc] = m[inst.Out]
   370  			// pass matching runes back through these no-ops.
   371  			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   372  			inst.Next = []uint32{}
   373  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   374  				inst.Next = append(inst.Next, inst.Out)
   375  			}
   376  		case syntax.InstEmptyWidth:
   377  			ok = check(inst.Out, m)
   378  			m[pc] = m[inst.Out]
   379  			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   380  			inst.Next = []uint32{}
   381  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   382  				inst.Next = append(inst.Next, inst.Out)
   383  			}
   384  		case syntax.InstMatch, syntax.InstFail:
   385  			m[pc] = inst.Op == syntax.InstMatch
   386  			break
   387  		case syntax.InstRune:
   388  			ok = check(inst.Out, m)
   389  			m[pc] = false
   390  			if len(inst.Next) > 0 {
   391  				break
   392  			}
   393  			if len(inst.Rune) == 0 {
   394  				onePassRunes[pc] = []rune{}
   395  				inst.Next = []uint32{inst.Out}
   396  				break
   397  			}
   398  			runes := make([]rune, 0)
   399  			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   400  				r0 := inst.Rune[0]
   401  				runes = append(runes, r0, r0)
   402  				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   403  					runes = append(runes, r1, r1)
   404  				}
   405  				sort.Sort(runeSlice(runes))
   406  			} else {
   407  				runes = append(runes, inst.Rune...)
   408  			}
   409  			onePassRunes[pc] = runes
   410  			inst.Next = []uint32{}
   411  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   412  				inst.Next = append(inst.Next, inst.Out)
   413  			}
   414  			inst.Op = syntax.InstRune
   415  		case syntax.InstRune1:
   416  			ok = check(inst.Out, m)
   417  			m[pc] = false
   418  			if len(inst.Next) > 0 {
   419  				break
   420  			}
   421  			runes := []rune{}
   422  			// expand case-folded runes
   423  			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   424  				r0 := inst.Rune[0]
   425  				runes = append(runes, r0, r0)
   426  				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   427  					runes = append(runes, r1, r1)
   428  				}
   429  				sort.Sort(runeSlice(runes))
   430  			} else {
   431  				runes = append(runes, inst.Rune[0], inst.Rune[0])
   432  			}
   433  			onePassRunes[pc] = runes
   434  			inst.Next = []uint32{}
   435  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   436  				inst.Next = append(inst.Next, inst.Out)
   437  			}
   438  			inst.Op = syntax.InstRune
   439  		case syntax.InstRuneAny:
   440  			ok = check(inst.Out, m)
   441  			m[pc] = false
   442  			if len(inst.Next) > 0 {
   443  				break
   444  			}
   445  			onePassRunes[pc] = append([]rune{}, anyRune...)
   446  			inst.Next = []uint32{inst.Out}
   447  		case syntax.InstRuneAnyNotNL:
   448  			ok = check(inst.Out, m)
   449  			m[pc] = false
   450  			if len(inst.Next) > 0 {
   451  				break
   452  			}
   453  			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
   454  			inst.Next = []uint32{}
   455  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   456  				inst.Next = append(inst.Next, inst.Out)
   457  			}
   458  		}
   459  		return
   460  	}
   461  
   462  	instQueue.clear()
   463  	instQueue.insert(uint32(p.Start))
   464  	m := make(map[uint32]bool, len(p.Inst))
   465  	for !instQueue.empty() {
   466  		pc := instQueue.next()
   467  		inst := p.Inst[pc]
   468  		visitQueue.clear()
   469  		if !check(uint32(pc), m) {
   470  			p = notOnePass
   471  			break
   472  		}
   473  		switch inst.Op {
   474  		case syntax.InstAlt, syntax.InstAltMatch:
   475  			instQueue.insert(inst.Out)
   476  			instQueue.insert(inst.Arg)
   477  		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop:
   478  			instQueue.insert(inst.Out)
   479  		case syntax.InstMatch:
   480  		case syntax.InstFail:
   481  		case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   482  		default:
   483  		}
   484  	}
   485  	if p != notOnePass {
   486  		for i := range p.Inst {
   487  			p.Inst[i].Rune = onePassRunes[i]
   488  		}
   489  	}
   490  	return p
   491  }
   492  
   493  // walk visits each Inst in the prog once, and applies the argument
   494  // function(ip, next), in pre-order.
   495  func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) {
   496  	var walk1 func(uint32)
   497  	progQueue := newQueue(len(prog.Inst))
   498  	walk1 = func(ip uint32) {
   499  		if progQueue.contains(ip) {
   500  			return
   501  		}
   502  		progQueue.insert(ip)
   503  		inst := prog.Inst[ip]
   504  		switch inst.Op {
   505  		case syntax.InstAlt, syntax.InstAltMatch:
   506  			for _, f := range funcs {
   507  				f(ip, inst.Out)
   508  				f(ip, inst.Arg)
   509  			}
   510  			walk1(inst.Out)
   511  			walk1(inst.Arg)
   512  		default:
   513  			for _, f := range funcs {
   514  				f(ip, inst.Out)
   515  			}
   516  			walk1(inst.Out)
   517  		}
   518  	}
   519  	walk1(uint32(prog.Start))
   520  }
   521  
   522  // find returns the Insts that match the argument predicate function
   523  func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) {
   524  	matches = []uint32{}
   525  
   526  	for ip := range prog.Inst {
   527  		if f(prog, ip) {
   528  			matches = append(matches, uint32(ip))
   529  		}
   530  	}
   531  	return
   532  }
   533  
   534  var notOnePass *onePassProg = nil
   535  
   536  // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
   537  // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
   538  // Prog cannot be converted. For a one pass prog, the fundamental condition that must
   539  // be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
   540  func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
   541  	if prog.Start == 0 {
   542  		return notOnePass
   543  	}
   544  	// onepass regexp is anchored
   545  	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
   546  		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
   547  		return notOnePass
   548  	}
   549  	// every instruction leading to InstMatch must be EmptyEndText
   550  	for _, inst := range prog.Inst {
   551  		opOut := prog.Inst[inst.Out].Op
   552  		switch inst.Op {
   553  		default:
   554  			if opOut == syntax.InstMatch {
   555  				return notOnePass
   556  			}
   557  		case syntax.InstAlt, syntax.InstAltMatch:
   558  			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
   559  				return notOnePass
   560  			}
   561  		case syntax.InstEmptyWidth:
   562  			if opOut == syntax.InstMatch {
   563  				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
   564  					continue
   565  				}
   566  				return notOnePass
   567  			}
   568  		}
   569  	}
   570  	// Creates a slightly optimized copy of the original Prog
   571  	// that cleans up some Prog idioms that block valid onepass programs
   572  	p = onePassCopy(prog)
   573  
   574  	// checkAmbiguity on InstAlts, build onepass Prog if possible
   575  	p = makeOnePass(p)
   576  
   577  	if p != notOnePass {
   578  		cleanupOnePass(p, prog)
   579  	}
   580  	return p
   581  }