github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/regexp/syntax/prog.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "bytes" 9 "strconv" 10 "unicode" 11 ) 12 13 // Compiled program. 14 // May not belong in this package, but convenient for now. 15 16 // A Prog is a compiled regular expression program. 17 type Prog struct { 18 Inst []Inst 19 Start int // index of start instruction 20 NumCap int // number of InstCapture insts in re 21 } 22 23 // An InstOp is an instruction opcode. 24 type InstOp uint8 25 26 const ( 27 InstAlt InstOp = iota 28 InstAltMatch 29 InstCapture 30 InstEmptyWidth 31 InstMatch 32 InstFail 33 InstNop 34 InstRune 35 InstRune1 36 InstRuneAny 37 InstRuneAnyNotNL 38 ) 39 40 // An EmptyOp specifies a kind or mixture of zero-width assertions. 41 type EmptyOp uint8 42 43 const ( 44 EmptyBeginLine EmptyOp = 1 << iota 45 EmptyEndLine 46 EmptyBeginText 47 EmptyEndText 48 EmptyWordBoundary 49 EmptyNoWordBoundary 50 ) 51 52 // EmptyOpContext returns the zero-width assertions 53 // satisfied at the position between the runes r1 and r2. 54 // Passing r1 == -1 indicates that the position is 55 // at the beginning of the text. 56 // Passing r2 == -1 indicates that the position is 57 // at the end of the text. 58 func EmptyOpContext(r1, r2 rune) EmptyOp { 59 var op EmptyOp 60 if r1 < 0 { 61 op |= EmptyBeginText | EmptyBeginLine 62 } 63 if r1 == '\n' { 64 op |= EmptyBeginLine 65 } 66 if r2 < 0 { 67 op |= EmptyEndText | EmptyEndLine 68 } 69 if r2 == '\n' { 70 op |= EmptyEndLine 71 } 72 if IsWordChar(r1) != IsWordChar(r2) { 73 op |= EmptyWordBoundary 74 } else { 75 op |= EmptyNoWordBoundary 76 } 77 return op 78 } 79 80 // IsWordChar reports whether r is consider a ``word character'' 81 // during the evaluation of the \b and \B zero-width assertions. 82 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. 83 func IsWordChar(r rune) bool { 84 return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' 85 } 86 87 // An Inst is a single instruction in a regular expression program. 88 type Inst struct { 89 Op InstOp 90 Out uint32 // all but InstMatch, InstFail 91 Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth 92 Rune []rune 93 } 94 95 func (p *Prog) String() string { 96 var b bytes.Buffer 97 dumpProg(&b, p) 98 return b.String() 99 } 100 101 // skipNop follows any no-op or capturing instructions 102 // and returns the resulting pc. 103 func (p *Prog) skipNop(pc uint32) *Inst { 104 i := &p.Inst[pc] 105 for i.Op == InstNop || i.Op == InstCapture { 106 pc = i.Out 107 i = &p.Inst[pc] 108 } 109 return i 110 } 111 112 // op returns i.Op but merges all the Rune special cases into InstRune 113 func (i *Inst) op() InstOp { 114 op := i.Op 115 switch op { 116 case InstRune1, InstRuneAny, InstRuneAnyNotNL: 117 op = InstRune 118 } 119 return op 120 } 121 122 // Prefix returns a literal string that all matches for the 123 // regexp must start with. Complete is true if the prefix 124 // is the entire match. 125 func (p *Prog) Prefix() (prefix string, complete bool) { 126 i := p.skipNop(uint32(p.Start)) 127 128 // Avoid allocation of buffer if prefix is empty. 129 if i.op() != InstRune || len(i.Rune) != 1 { 130 return "", i.Op == InstMatch 131 } 132 133 // Have prefix; gather characters. 134 var buf bytes.Buffer 135 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 { 136 buf.WriteRune(i.Rune[0]) 137 i = p.skipNop(i.Out) 138 } 139 return buf.String(), i.Op == InstMatch 140 } 141 142 // StartCond returns the leading empty-width conditions that must 143 // be true in any match. It returns ^EmptyOp(0) if no matches are possible. 144 func (p *Prog) StartCond() EmptyOp { 145 var flag EmptyOp 146 pc := uint32(p.Start) 147 i := &p.Inst[pc] 148 Loop: 149 for { 150 switch i.Op { 151 case InstEmptyWidth: 152 flag |= EmptyOp(i.Arg) 153 case InstFail: 154 return ^EmptyOp(0) 155 case InstCapture, InstNop: 156 // skip 157 default: 158 break Loop 159 } 160 pc = i.Out 161 i = &p.Inst[pc] 162 } 163 return flag 164 } 165 166 // MatchRune returns true if the instruction matches (and consumes) r. 167 // It should only be called when i.Op == InstRune. 168 func (i *Inst) MatchRune(r rune) bool { 169 rune := i.Rune 170 171 // Special case: single-rune slice is from literal string, not char class. 172 if len(rune) == 1 { 173 r0 := rune[0] 174 if r == r0 { 175 return true 176 } 177 if Flags(i.Arg)&FoldCase != 0 { 178 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 179 if r == r1 { 180 return true 181 } 182 } 183 } 184 return false 185 } 186 187 // Peek at the first few pairs. 188 // Should handle ASCII well. 189 for j := 0; j < len(rune) && j <= 8; j += 2 { 190 if r < rune[j] { 191 return false 192 } 193 if r <= rune[j+1] { 194 return true 195 } 196 } 197 198 // Otherwise binary search. 199 lo := 0 200 hi := len(rune) / 2 201 for lo < hi { 202 m := lo + (hi-lo)/2 203 if c := rune[2*m]; c <= r { 204 if r <= rune[2*m+1] { 205 return true 206 } 207 lo = m + 1 208 } else { 209 hi = m 210 } 211 } 212 return false 213 } 214 215 // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char. 216 // Since we act on runes, it would be easy to support Unicode here. 217 func wordRune(r rune) bool { 218 return r == '_' || 219 ('A' <= r && r <= 'Z') || 220 ('a' <= r && r <= 'z') || 221 ('0' <= r && r <= '9') 222 } 223 224 // MatchEmptyWidth returns true if the instruction matches 225 // an empty string between the runes before and after. 226 // It should only be called when i.Op == InstEmptyWidth. 227 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { 228 switch EmptyOp(i.Arg) { 229 case EmptyBeginLine: 230 return before == '\n' || before == -1 231 case EmptyEndLine: 232 return after == '\n' || after == -1 233 case EmptyBeginText: 234 return before == -1 235 case EmptyEndText: 236 return after == -1 237 case EmptyWordBoundary: 238 return wordRune(before) != wordRune(after) 239 case EmptyNoWordBoundary: 240 return wordRune(before) == wordRune(after) 241 } 242 panic("unknown empty width arg") 243 } 244 245 func (i *Inst) String() string { 246 var b bytes.Buffer 247 dumpInst(&b, i) 248 return b.String() 249 } 250 251 func bw(b *bytes.Buffer, args ...string) { 252 for _, s := range args { 253 b.WriteString(s) 254 } 255 } 256 257 func dumpProg(b *bytes.Buffer, p *Prog) { 258 for j := range p.Inst { 259 i := &p.Inst[j] 260 pc := strconv.Itoa(j) 261 if len(pc) < 3 { 262 b.WriteString(" "[len(pc):]) 263 } 264 if j == p.Start { 265 pc += "*" 266 } 267 bw(b, pc, "\t") 268 dumpInst(b, i) 269 bw(b, "\n") 270 } 271 } 272 273 func u32(i uint32) string { 274 return strconv.FormatUint(uint64(i), 10) 275 } 276 277 func dumpInst(b *bytes.Buffer, i *Inst) { 278 switch i.Op { 279 case InstAlt: 280 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 281 case InstAltMatch: 282 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 283 case InstCapture: 284 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 285 case InstEmptyWidth: 286 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 287 case InstMatch: 288 bw(b, "match") 289 case InstFail: 290 bw(b, "fail") 291 case InstNop: 292 bw(b, "nop -> ", u32(i.Out)) 293 case InstRune: 294 if i.Rune == nil { 295 // shouldn't happen 296 bw(b, "rune <nil>") 297 } 298 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 299 if Flags(i.Arg)&FoldCase != 0 { 300 bw(b, "/i") 301 } 302 bw(b, " -> ", u32(i.Out)) 303 case InstRune1: 304 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 305 case InstRuneAny: 306 bw(b, "any -> ", u32(i.Out)) 307 case InstRuneAnyNotNL: 308 bw(b, "anynotnl -> ", u32(i.Out)) 309 } 310 }