github.com/ader1990/go@v0.0.0-20140630135419-8c24447fa791/src/pkg/regexp/onepass.go (about)

     1  // Copyright 2014 The Go Authors.  All rights reserved.
     2  
     3  package regexp
     4  
     5  import (
     6  	"bytes"
     7  	"regexp/syntax"
     8  	"sort"
     9  	"unicode"
    10  )
    11  
    12  // Use of this source code is governed by a BSD-style
    13  // license that can be found in the LICENSE file.
    14  
    15  // "One-pass" regexp execution.
    16  // Some regexps can be analyzed to determine that they never need
    17  // backtracking: they are guaranteed to run in one pass over the string
    18  // without bothering to save all the usual NFA state.
    19  // Detect those and execute them more quickly.
    20  
// A onePassProg is a compiled one-pass regular expression program.
// It is the same as syntax.Prog except for the use of onePassInst.
type onePassProg struct {
	Inst   []onePassInst // instructions, addressed by program counter
	Start  int           // index of start instruction
	NumCap int           // number of InstCapture insts in re
}
    28  
// A onePassInst is a single instruction in a one-pass regular expression program.
// It is the same as syntax.Inst except for the new 'Next' field.
type onePassInst struct {
	syntax.Inst
	// Next is the dispatch table consulted by onePassNext: the index
	// returned by MatchRunePos for the input rune selects the entry of
	// Next holding the pc to continue at.
	Next []uint32
}
    35  
    36  // OnePassPrefix returns a literal string that all matches for the
    37  // regexp must start with.  Complete is true if the prefix
    38  // is the entire match. Pc is the index of the last rune instruction
    39  // in the string. The OnePassPrefix skips over the mandatory
    40  // EmptyBeginText
    41  func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
    42  	i := &p.Inst[p.Start]
    43  	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
    44  		return "", i.Op == syntax.InstMatch, uint32(p.Start)
    45  	}
    46  	pc = i.Out
    47  	i = &p.Inst[pc]
    48  	for i.Op == syntax.InstNop {
    49  		pc = i.Out
    50  		i = &p.Inst[pc]
    51  	}
    52  	// Avoid allocation of buffer if prefix is empty.
    53  	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
    54  		return "", i.Op == syntax.InstMatch, uint32(p.Start)
    55  	}
    56  
    57  	// Have prefix; gather characters.
    58  	var buf bytes.Buffer
    59  	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
    60  		buf.WriteRune(i.Rune[0])
    61  		pc, i = i.Out, &p.Inst[i.Out]
    62  	}
    63  	return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc
    64  }
    65  
    66  // OnePassNext selects the next actionable state of the prog, based on the input character.
    67  // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
    68  // One of the alternates may ultimately lead without input to end of line. If the instruction
    69  // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
    70  func onePassNext(i *onePassInst, r rune) uint32 {
    71  	next := i.MatchRunePos(r)
    72  	if next >= 0 {
    73  		return i.Next[next]
    74  	}
    75  	if i.Op == syntax.InstAltMatch {
    76  		return i.Out
    77  	}
    78  	return 0
    79  }
    80  
    81  func iop(i *syntax.Inst) syntax.InstOp {
    82  	op := i.Op
    83  	switch op {
    84  	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    85  		op = syntax.InstRune
    86  	}
    87  	return op
    88  }
    89  
    90  // Sparse Array implementation is used as a queueOnePass.
    91  type queueOnePass struct {
    92  	sparse          []uint32
    93  	dense           []uint32
    94  	size, nextIndex uint32
    95  }
    96  
    97  func (q *queueOnePass) empty() bool {
    98  	return q.nextIndex >= q.size
    99  }
   100  
   101  func (q *queueOnePass) next() (n uint32) {
   102  	n = q.dense[q.nextIndex]
   103  	q.nextIndex++
   104  	return
   105  }
   106  
   107  func (q *queueOnePass) clear() {
   108  	q.size = 0
   109  	q.nextIndex = 0
   110  }
   111  
   112  func (q *queueOnePass) reset() {
   113  	q.nextIndex = 0
   114  }
   115  
   116  func (q *queueOnePass) contains(u uint32) bool {
   117  	if u >= uint32(len(q.sparse)) {
   118  		return false
   119  	}
   120  	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
   121  }
   122  
   123  func (q *queueOnePass) insert(u uint32) {
   124  	if !q.contains(u) {
   125  		q.insertNew(u)
   126  	}
   127  }
   128  
   129  func (q *queueOnePass) insertNew(u uint32) {
   130  	if u >= uint32(len(q.sparse)) {
   131  		return
   132  	}
   133  	q.sparse[u] = q.size
   134  	q.dense[q.size] = u
   135  	q.size++
   136  }
   137  
   138  func newQueue(size int) (q *queueOnePass) {
   139  	return &queueOnePass{
   140  		sparse: make([]uint32, size),
   141  		dense:  make([]uint32, size),
   142  	}
   143  }
   144  
   145  // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
   146  // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
   147  // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
   148  // NextIp array with the single element mergeFailed is returned.
   149  // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
   150  const mergeFailed = uint32(0xffffffff)
   151  
   152  var (
   153  	noRune = []rune{}
   154  	noNext = []uint32{mergeFailed}
   155  )
   156  
   157  func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
   158  	leftLen := len(*leftRunes)
   159  	rightLen := len(*rightRunes)
   160  	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
   161  		panic("mergeRuneSets odd length []rune")
   162  	}
   163  	var (
   164  		lx, rx int
   165  	)
   166  	merged := make([]rune, 0)
   167  	next := make([]uint32, 0)
   168  	ok := true
   169  	defer func() {
   170  		if !ok {
   171  			merged = nil
   172  			next = nil
   173  		}
   174  	}()
   175  
   176  	ix := -1
   177  	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
   178  		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
   179  			return false
   180  		}
   181  		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
   182  		*newLow += 2
   183  		ix += 2
   184  		next = append(next, pc)
   185  		return true
   186  	}
   187  
   188  	for lx < leftLen || rx < rightLen {
   189  		switch {
   190  		case rx >= rightLen:
   191  			ok = extend(&lx, leftRunes, leftPC)
   192  		case lx >= leftLen:
   193  			ok = extend(&rx, rightRunes, rightPC)
   194  		case (*rightRunes)[rx] < (*leftRunes)[lx]:
   195  			ok = extend(&rx, rightRunes, rightPC)
   196  		default:
   197  			ok = extend(&lx, leftRunes, leftPC)
   198  		}
   199  		if !ok {
   200  			return noRune, noNext
   201  		}
   202  	}
   203  	return merged, next
   204  }
   205  
   206  // cleanupOnePass drops working memory, and restores certain shortcut instructions.
   207  func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
   208  	for ix, instOriginal := range original.Inst {
   209  		switch instOriginal.Op {
   210  		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
   211  		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
   212  			prog.Inst[ix].Next = nil
   213  		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   214  			prog.Inst[ix].Next = nil
   215  			prog.Inst[ix] = onePassInst{Inst: instOriginal}
   216  		}
   217  	}
   218  }
   219  
   220  // onePassCopy creates a copy of the original Prog, as we'll be modifying it
   221  func onePassCopy(prog *syntax.Prog) *onePassProg {
   222  	p := &onePassProg{
   223  		Start:  prog.Start,
   224  		NumCap: prog.NumCap,
   225  	}
   226  	for _, inst := range prog.Inst {
   227  		p.Inst = append(p.Inst, onePassInst{Inst: inst})
   228  	}
   229  
   230  	// rewrites one or more common Prog constructs that enable some otherwise
   231  	// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
   232  	// ip A, that points to ips B & C.
   233  	// A:BC + B:DA => A:BC + B:CD
   234  	// A:BC + B:DC => A:DC + B:DC
   235  	for pc := range p.Inst {
   236  		switch p.Inst[pc].Op {
   237  		default:
   238  			continue
   239  		case syntax.InstAlt, syntax.InstAltMatch:
   240  			// A:Bx + B:Ay
   241  			p_A_Other := &p.Inst[pc].Out
   242  			p_A_Alt := &p.Inst[pc].Arg
   243  			// make sure a target is another Alt
   244  			instAlt := p.Inst[*p_A_Alt]
   245  			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   246  				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
   247  				instAlt = p.Inst[*p_A_Alt]
   248  				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   249  					continue
   250  				}
   251  			}
   252  			instOther := p.Inst[*p_A_Other]
   253  			// Analyzing both legs pointing to Alts is for another day
   254  			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
   255  				// too complicated
   256  				continue
   257  			}
   258  			// simple empty transition loop
   259  			// A:BC + B:DA => A:BC + B:DC
   260  			p_B_Alt := &p.Inst[*p_A_Alt].Out
   261  			p_B_Other := &p.Inst[*p_A_Alt].Arg
   262  			patch := false
   263  			if instAlt.Out == uint32(pc) {
   264  				patch = true
   265  			} else if instAlt.Arg == uint32(pc) {
   266  				patch = true
   267  				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
   268  			}
   269  			if patch {
   270  				*p_B_Alt = *p_A_Other
   271  			}
   272  
   273  			// empty transition to common target
   274  			// A:BC + B:DC => A:DC + B:DC
   275  			if *p_A_Other == *p_B_Alt {
   276  				*p_A_Alt = *p_B_Other
   277  			}
   278  		}
   279  	}
   280  	return p
   281  }
   282  
   283  // runeSlice exists to permit sorting the case-folded rune sets.
   284  type runeSlice []rune
   285  
   286  func (p runeSlice) Len() int           { return len(p) }
   287  func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
   288  func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
   289  
   290  // Sort is a convenience method.
   291  func (p runeSlice) Sort() {
   292  	sort.Sort(p)
   293  }
   294  
   295  var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
   296  var anyRune = []rune{0, unicode.MaxRune}
   297  
   298  // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
   299  // the match engine can always tell which branch to take. The routine may modify
   300  // p if it is turned into a onepass Prog. If it isn't possible for this to be a
   301  // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
   302  // to the size of the Prog.
   303  func makeOnePass(p *onePassProg) *onePassProg {
   304  	// If the machine is very long, it's not worth the time to check if we can use one pass.
   305  	if len(p.Inst) >= 1000 {
   306  		return notOnePass
   307  	}
   308  
   309  	var (
   310  		instQueue    = newQueue(len(p.Inst))
   311  		visitQueue   = newQueue(len(p.Inst))
   312  		build        func(uint32, *queueOnePass)
   313  		check        func(uint32, map[uint32]bool) bool
   314  		onePassRunes = make([][]rune, len(p.Inst))
   315  	)
   316  	build = func(pc uint32, q *queueOnePass) {
   317  		if q.contains(pc) {
   318  			return
   319  		}
   320  		inst := p.Inst[pc]
   321  		switch inst.Op {
   322  		case syntax.InstAlt, syntax.InstAltMatch:
   323  			q.insert(inst.Out)
   324  			build(inst.Out, q)
   325  			q.insert(inst.Arg)
   326  		case syntax.InstMatch, syntax.InstFail:
   327  		default:
   328  			q.insert(inst.Out)
   329  		}
   330  	}
   331  
   332  	// check that paths from Alt instructions are unambiguous, and rebuild the new
   333  	// program as a onepass program
   334  	check = func(pc uint32, m map[uint32]bool) (ok bool) {
   335  		ok = true
   336  		inst := &p.Inst[pc]
   337  		if visitQueue.contains(pc) {
   338  			return
   339  		}
   340  		visitQueue.insert(pc)
   341  		switch inst.Op {
   342  		case syntax.InstAlt, syntax.InstAltMatch:
   343  			ok = check(inst.Out, m) && check(inst.Arg, m)
   344  			// check no-input paths to InstMatch
   345  			matchOut := m[inst.Out]
   346  			matchArg := m[inst.Arg]
   347  			if matchOut && matchArg {
   348  				ok = false
   349  				break
   350  			}
   351  			// Match on empty goes in inst.Out
   352  			if matchArg {
   353  				inst.Out, inst.Arg = inst.Arg, inst.Out
   354  				matchOut, matchArg = matchArg, matchOut
   355  			}
   356  			if matchOut {
   357  				m[pc] = true
   358  				inst.Op = syntax.InstAltMatch
   359  			}
   360  
   361  			// build a dispatch operator from the two legs of the alt.
   362  			onePassRunes[pc], inst.Next = mergeRuneSets(
   363  				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
   364  			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
   365  				ok = false
   366  				break
   367  			}
   368  		case syntax.InstCapture, syntax.InstNop:
   369  			ok = check(inst.Out, m)
   370  			m[pc] = m[inst.Out]
   371  			// pass matching runes back through these no-ops.
   372  			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   373  			inst.Next = []uint32{}
   374  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   375  				inst.Next = append(inst.Next, inst.Out)
   376  			}
   377  		case syntax.InstEmptyWidth:
   378  			ok = check(inst.Out, m)
   379  			m[pc] = m[inst.Out]
   380  			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   381  			inst.Next = []uint32{}
   382  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   383  				inst.Next = append(inst.Next, inst.Out)
   384  			}
   385  		case syntax.InstMatch, syntax.InstFail:
   386  			m[pc] = inst.Op == syntax.InstMatch
   387  			break
   388  		case syntax.InstRune:
   389  			ok = check(inst.Out, m)
   390  			m[pc] = false
   391  			if len(inst.Next) > 0 {
   392  				break
   393  			}
   394  			if len(inst.Rune) == 0 {
   395  				onePassRunes[pc] = []rune{}
   396  				inst.Next = []uint32{inst.Out}
   397  				break
   398  			}
   399  			runes := make([]rune, 0)
   400  			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   401  				r0 := inst.Rune[0]
   402  				runes = append(runes, r0, r0)
   403  				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   404  					runes = append(runes, r1, r1)
   405  				}
   406  				sort.Sort(runeSlice(runes))
   407  			} else {
   408  				runes = append(runes, inst.Rune...)
   409  			}
   410  			onePassRunes[pc] = runes
   411  			inst.Next = []uint32{}
   412  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   413  				inst.Next = append(inst.Next, inst.Out)
   414  			}
   415  			inst.Op = syntax.InstRune
   416  		case syntax.InstRune1:
   417  			ok = check(inst.Out, m)
   418  			m[pc] = false
   419  			if len(inst.Next) > 0 {
   420  				break
   421  			}
   422  			runes := []rune{}
   423  			// expand case-folded runes
   424  			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   425  				r0 := inst.Rune[0]
   426  				runes = append(runes, r0, r0)
   427  				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   428  					runes = append(runes, r1, r1)
   429  				}
   430  				sort.Sort(runeSlice(runes))
   431  			} else {
   432  				runes = append(runes, inst.Rune[0], inst.Rune[0])
   433  			}
   434  			onePassRunes[pc] = runes
   435  			inst.Next = []uint32{}
   436  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   437  				inst.Next = append(inst.Next, inst.Out)
   438  			}
   439  			inst.Op = syntax.InstRune
   440  		case syntax.InstRuneAny:
   441  			ok = check(inst.Out, m)
   442  			m[pc] = false
   443  			if len(inst.Next) > 0 {
   444  				break
   445  			}
   446  			onePassRunes[pc] = append([]rune{}, anyRune...)
   447  			inst.Next = []uint32{inst.Out}
   448  		case syntax.InstRuneAnyNotNL:
   449  			ok = check(inst.Out, m)
   450  			m[pc] = false
   451  			if len(inst.Next) > 0 {
   452  				break
   453  			}
   454  			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
   455  			inst.Next = []uint32{}
   456  			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   457  				inst.Next = append(inst.Next, inst.Out)
   458  			}
   459  		}
   460  		return
   461  	}
   462  
   463  	instQueue.clear()
   464  	instQueue.insert(uint32(p.Start))
   465  	m := make(map[uint32]bool, len(p.Inst))
   466  	for !instQueue.empty() {
   467  		pc := instQueue.next()
   468  		inst := p.Inst[pc]
   469  		visitQueue.clear()
   470  		if !check(uint32(pc), m) {
   471  			p = notOnePass
   472  			break
   473  		}
   474  		switch inst.Op {
   475  		case syntax.InstAlt, syntax.InstAltMatch:
   476  			instQueue.insert(inst.Out)
   477  			instQueue.insert(inst.Arg)
   478  		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop:
   479  			instQueue.insert(inst.Out)
   480  		case syntax.InstMatch:
   481  		case syntax.InstFail:
   482  		case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   483  		default:
   484  		}
   485  	}
   486  	if p != notOnePass {
   487  		for i, _ := range p.Inst {
   488  			p.Inst[i].Rune = onePassRunes[i]
   489  		}
   490  	}
   491  	return p
   492  }
   493  
   494  // walk visits each Inst in the prog once, and applies the argument
   495  // function(ip, next), in pre-order.
   496  func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) {
   497  	var walk1 func(uint32)
   498  	progQueue := newQueue(len(prog.Inst))
   499  	walk1 = func(ip uint32) {
   500  		if progQueue.contains(ip) {
   501  			return
   502  		}
   503  		progQueue.insert(ip)
   504  		inst := prog.Inst[ip]
   505  		switch inst.Op {
   506  		case syntax.InstAlt, syntax.InstAltMatch:
   507  			for _, f := range funcs {
   508  				f(ip, inst.Out)
   509  				f(ip, inst.Arg)
   510  			}
   511  			walk1(inst.Out)
   512  			walk1(inst.Arg)
   513  		default:
   514  			for _, f := range funcs {
   515  				f(ip, inst.Out)
   516  			}
   517  			walk1(inst.Out)
   518  		}
   519  	}
   520  	walk1(uint32(prog.Start))
   521  }
   522  
   523  // find returns the Insts that match the argument predicate function
   524  func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) {
   525  	matches = []uint32{}
   526  
   527  	for ip := range prog.Inst {
   528  		if f(prog, ip) {
   529  			matches = append(matches, uint32(ip))
   530  		}
   531  	}
   532  	return
   533  }
   534  
   535  var notOnePass *onePassProg = nil
   536  
   537  // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
   538  // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
   539  // Prog cannot be converted. For a one pass prog, the fundamental condition that must
   540  // be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
   541  func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
   542  	if prog.Start == 0 {
   543  		return notOnePass
   544  	}
   545  	// onepass regexp is anchored
   546  	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
   547  		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
   548  		return notOnePass
   549  	}
   550  	// every instruction leading to InstMatch must be EmptyEndText
   551  	for _, inst := range prog.Inst {
   552  		opOut := prog.Inst[inst.Out].Op
   553  		switch inst.Op {
   554  		default:
   555  			if opOut == syntax.InstMatch {
   556  				return notOnePass
   557  			}
   558  		case syntax.InstAlt, syntax.InstAltMatch:
   559  			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
   560  				return notOnePass
   561  			}
   562  		case syntax.InstEmptyWidth:
   563  			if opOut == syntax.InstMatch {
   564  				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
   565  					continue
   566  				}
   567  				return notOnePass
   568  			}
   569  		}
   570  	}
   571  	// Creates a slightly optimized copy of the original Prog
   572  	// that cleans up some Prog idioms that block valid onepass programs
   573  	p = onePassCopy(prog)
   574  
   575  	// checkAmbiguity on InstAlts, build onepass Prog if possible
   576  	p = makeOnePass(p)
   577  
   578  	if p != notOnePass {
   579  		cleanupOnePass(p, prog)
   580  	}
   581  	return p
   582  }