github.com/tcnksm/go@v0.0.0-20141208075154-439b32936367/src/regexp/syntax/compile.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  import "unicode"
     8  
     9  // A patchList is a list of instruction pointers that need to be filled in (patched).
    10  // Because the pointers haven't been filled in yet, we can reuse their storage
    11  // to hold the list.  It's kind of sleazy, but works well in practice.
    12  // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
    13  //
    14  // These aren't really pointers: they're integers, so we can reinterpret them
    15  // this way without using package unsafe.  A value l denotes
    16  // p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1).
    17  // l == 0 denotes the empty list, okay because we start every program
    18  // with a fail instruction, so we'll never want to point at its output link.
    19  type patchList uint32
    20  
    21  func (l patchList) next(p *Prog) patchList {
    22  	i := &p.Inst[l>>1]
    23  	if l&1 == 0 {
    24  		return patchList(i.Out)
    25  	}
    26  	return patchList(i.Arg)
    27  }
    28  
    29  func (l patchList) patch(p *Prog, val uint32) {
    30  	for l != 0 {
    31  		i := &p.Inst[l>>1]
    32  		if l&1 == 0 {
    33  			l = patchList(i.Out)
    34  			i.Out = val
    35  		} else {
    36  			l = patchList(i.Arg)
    37  			i.Arg = val
    38  		}
    39  	}
    40  }
    41  
    42  func (l1 patchList) append(p *Prog, l2 patchList) patchList {
    43  	if l1 == 0 {
    44  		return l2
    45  	}
    46  	if l2 == 0 {
    47  		return l1
    48  	}
    49  
    50  	last := l1
    51  	for {
    52  		next := last.next(p)
    53  		if next == 0 {
    54  			break
    55  		}
    56  		last = next
    57  	}
    58  
    59  	i := &p.Inst[last>>1]
    60  	if last&1 == 0 {
    61  		i.Out = uint32(l2)
    62  	} else {
    63  		i.Arg = uint32(l2)
    64  	}
    65  	return l1
    66  }
    67  
    68  // A frag represents a compiled program fragment.
    69  type frag struct {
    70  	i   uint32    // index of first instruction
    71  	out patchList // where to record end instruction
    72  }
    73  
    74  type compiler struct {
    75  	p *Prog
    76  }
    77  
    78  // Compile compiles the regexp into a program to be executed.
    79  // The regexp should have been simplified already (returned from re.Simplify).
    80  func Compile(re *Regexp) (*Prog, error) {
    81  	var c compiler
    82  	c.init()
    83  	f := c.compile(re)
    84  	f.out.patch(c.p, c.inst(InstMatch).i)
    85  	c.p.Start = int(f.i)
    86  	return c.p, nil
    87  }
    88  
    89  func (c *compiler) init() {
    90  	c.p = new(Prog)
    91  	c.p.NumCap = 2 // implicit ( and ) for whole match $0
    92  	c.inst(InstFail)
    93  }
    94  
    95  var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
    96  var anyRune = []rune{0, unicode.MaxRune}
    97  
    98  func (c *compiler) compile(re *Regexp) frag {
    99  	switch re.Op {
   100  	case OpNoMatch:
   101  		return c.fail()
   102  	case OpEmptyMatch:
   103  		return c.nop()
   104  	case OpLiteral:
   105  		if len(re.Rune) == 0 {
   106  			return c.nop()
   107  		}
   108  		var f frag
   109  		for j := range re.Rune {
   110  			f1 := c.rune(re.Rune[j:j+1], re.Flags)
   111  			if j == 0 {
   112  				f = f1
   113  			} else {
   114  				f = c.cat(f, f1)
   115  			}
   116  		}
   117  		return f
   118  	case OpCharClass:
   119  		return c.rune(re.Rune, re.Flags)
   120  	case OpAnyCharNotNL:
   121  		return c.rune(anyRuneNotNL, 0)
   122  	case OpAnyChar:
   123  		return c.rune(anyRune, 0)
   124  	case OpBeginLine:
   125  		return c.empty(EmptyBeginLine)
   126  	case OpEndLine:
   127  		return c.empty(EmptyEndLine)
   128  	case OpBeginText:
   129  		return c.empty(EmptyBeginText)
   130  	case OpEndText:
   131  		return c.empty(EmptyEndText)
   132  	case OpWordBoundary:
   133  		return c.empty(EmptyWordBoundary)
   134  	case OpNoWordBoundary:
   135  		return c.empty(EmptyNoWordBoundary)
   136  	case OpCapture:
   137  		bra := c.cap(uint32(re.Cap << 1))
   138  		sub := c.compile(re.Sub[0])
   139  		ket := c.cap(uint32(re.Cap<<1 | 1))
   140  		return c.cat(c.cat(bra, sub), ket)
   141  	case OpStar:
   142  		return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
   143  	case OpPlus:
   144  		return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
   145  	case OpQuest:
   146  		return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
   147  	case OpConcat:
   148  		if len(re.Sub) == 0 {
   149  			return c.nop()
   150  		}
   151  		var f frag
   152  		for i, sub := range re.Sub {
   153  			if i == 0 {
   154  				f = c.compile(sub)
   155  			} else {
   156  				f = c.cat(f, c.compile(sub))
   157  			}
   158  		}
   159  		return f
   160  	case OpAlternate:
   161  		var f frag
   162  		for _, sub := range re.Sub {
   163  			f = c.alt(f, c.compile(sub))
   164  		}
   165  		return f
   166  	}
   167  	panic("regexp: unhandled case in compile")
   168  }
   169  
   170  func (c *compiler) inst(op InstOp) frag {
   171  	// TODO: impose length limit
   172  	f := frag{i: uint32(len(c.p.Inst))}
   173  	c.p.Inst = append(c.p.Inst, Inst{Op: op})
   174  	return f
   175  }
   176  
   177  func (c *compiler) nop() frag {
   178  	f := c.inst(InstNop)
   179  	f.out = patchList(f.i << 1)
   180  	return f
   181  }
   182  
   183  func (c *compiler) fail() frag {
   184  	return frag{}
   185  }
   186  
   187  func (c *compiler) cap(arg uint32) frag {
   188  	f := c.inst(InstCapture)
   189  	f.out = patchList(f.i << 1)
   190  	c.p.Inst[f.i].Arg = arg
   191  
   192  	if c.p.NumCap < int(arg)+1 {
   193  		c.p.NumCap = int(arg) + 1
   194  	}
   195  	return f
   196  }
   197  
   198  func (c *compiler) cat(f1, f2 frag) frag {
   199  	// concat of failure is failure
   200  	if f1.i == 0 || f2.i == 0 {
   201  		return frag{}
   202  	}
   203  
   204  	// TODO: elide nop
   205  
   206  	f1.out.patch(c.p, f2.i)
   207  	return frag{f1.i, f2.out}
   208  }
   209  
   210  func (c *compiler) alt(f1, f2 frag) frag {
   211  	// alt of failure is other
   212  	if f1.i == 0 {
   213  		return f2
   214  	}
   215  	if f2.i == 0 {
   216  		return f1
   217  	}
   218  
   219  	f := c.inst(InstAlt)
   220  	i := &c.p.Inst[f.i]
   221  	i.Out = f1.i
   222  	i.Arg = f2.i
   223  	f.out = f1.out.append(c.p, f2.out)
   224  	return f
   225  }
   226  
   227  func (c *compiler) quest(f1 frag, nongreedy bool) frag {
   228  	f := c.inst(InstAlt)
   229  	i := &c.p.Inst[f.i]
   230  	if nongreedy {
   231  		i.Arg = f1.i
   232  		f.out = patchList(f.i << 1)
   233  	} else {
   234  		i.Out = f1.i
   235  		f.out = patchList(f.i<<1 | 1)
   236  	}
   237  	f.out = f.out.append(c.p, f1.out)
   238  	return f
   239  }
   240  
   241  func (c *compiler) star(f1 frag, nongreedy bool) frag {
   242  	f := c.inst(InstAlt)
   243  	i := &c.p.Inst[f.i]
   244  	if nongreedy {
   245  		i.Arg = f1.i
   246  		f.out = patchList(f.i << 1)
   247  	} else {
   248  		i.Out = f1.i
   249  		f.out = patchList(f.i<<1 | 1)
   250  	}
   251  	f1.out.patch(c.p, f.i)
   252  	return f
   253  }
   254  
   255  func (c *compiler) plus(f1 frag, nongreedy bool) frag {
   256  	return frag{f1.i, c.star(f1, nongreedy).out}
   257  }
   258  
   259  func (c *compiler) empty(op EmptyOp) frag {
   260  	f := c.inst(InstEmptyWidth)
   261  	c.p.Inst[f.i].Arg = uint32(op)
   262  	f.out = patchList(f.i << 1)
   263  	return f
   264  }
   265  
   266  func (c *compiler) rune(r []rune, flags Flags) frag {
   267  	f := c.inst(InstRune)
   268  	i := &c.p.Inst[f.i]
   269  	i.Rune = r
   270  	flags &= FoldCase // only relevant flag is FoldCase
   271  	if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
   272  		// and sometimes not even that
   273  		flags &^= FoldCase
   274  	}
   275  	i.Arg = uint32(flags)
   276  	f.out = patchList(f.i << 1)
   277  
   278  	// Special cases for exec machine.
   279  	switch {
   280  	case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
   281  		i.Op = InstRune1
   282  	case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
   283  		i.Op = InstRuneAny
   284  	case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
   285  		i.Op = InstRuneAnyNotNL
   286  	}
   287  
   288  	return f
   289  }