github.com/varialus/godfly@v0.0.0-20130904042352-1934f9f095ab/src/pkg/regexp/syntax/prog.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "bytes" 9 "strconv" 10 "unicode" 11 ) 12 13 // Compiled program. 14 // May not belong in this package, but convenient for now. 15 16 // A Prog is a compiled regular expression program. 17 type Prog struct { 18 Inst []Inst 19 Start int // index of start instruction 20 NumCap int // number of InstCapture insts in re 21 } 22 23 // An InstOp is an instruction opcode. 24 type InstOp uint8 25 26 const ( 27 InstAlt InstOp = iota 28 InstAltMatch 29 InstCapture 30 InstEmptyWidth 31 InstMatch 32 InstFail 33 InstNop 34 InstRune 35 InstRune1 36 InstRuneAny 37 InstRuneAnyNotNL 38 ) 39 40 // An EmptyOp specifies a kind or mixture of zero-width assertions. 41 type EmptyOp uint8 42 43 const ( 44 EmptyBeginLine EmptyOp = 1 << iota 45 EmptyEndLine 46 EmptyBeginText 47 EmptyEndText 48 EmptyWordBoundary 49 EmptyNoWordBoundary 50 ) 51 52 // EmptyOpContext returns the zero-width assertions 53 // satisfied at the position between the runes r1 and r2. 54 // Passing r1 == -1 indicates that the position is 55 // at the beginning of the text. 56 // Passing r2 == -1 indicates that the position is 57 // at the end of the text. 58 func EmptyOpContext(r1, r2 rune) EmptyOp { 59 var op EmptyOp = EmptyNoWordBoundary 60 var boundary byte 61 switch { 62 case IsWordChar(r1): 63 boundary = 1 64 case r1 == '\n': 65 op |= EmptyBeginLine 66 case r1 < 0: 67 op |= EmptyBeginText | EmptyBeginLine 68 } 69 switch { 70 case IsWordChar(r2): 71 boundary ^= 1 72 case r2 == '\n': 73 op |= EmptyEndLine 74 case r2 < 0: 75 op |= EmptyEndText | EmptyEndLine 76 } 77 if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2) 78 op ^= (EmptyWordBoundary | EmptyNoWordBoundary) 79 } 80 return op 81 } 82 83 // IsWordChar reports whether r is consider a ``word character'' 84 // during the evaluation of the \b and \B zero-width assertions. 85 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. 86 func IsWordChar(r rune) bool { 87 return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' 88 } 89 90 // An Inst is a single instruction in a regular expression program. 91 type Inst struct { 92 Op InstOp 93 Out uint32 // all but InstMatch, InstFail 94 Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth 95 Rune []rune 96 } 97 98 func (p *Prog) String() string { 99 var b bytes.Buffer 100 dumpProg(&b, p) 101 return b.String() 102 } 103 104 // skipNop follows any no-op or capturing instructions 105 // and returns the resulting pc. 106 func (p *Prog) skipNop(pc uint32) *Inst { 107 i := &p.Inst[pc] 108 for i.Op == InstNop || i.Op == InstCapture { 109 pc = i.Out 110 i = &p.Inst[pc] 111 } 112 return i 113 } 114 115 // op returns i.Op but merges all the Rune special cases into InstRune 116 func (i *Inst) op() InstOp { 117 op := i.Op 118 switch op { 119 case InstRune1, InstRuneAny, InstRuneAnyNotNL: 120 op = InstRune 121 } 122 return op 123 } 124 125 // Prefix returns a literal string that all matches for the 126 // regexp must start with. Complete is true if the prefix 127 // is the entire match. 128 func (p *Prog) Prefix() (prefix string, complete bool) { 129 i := p.skipNop(uint32(p.Start)) 130 131 // Avoid allocation of buffer if prefix is empty. 132 if i.op() != InstRune || len(i.Rune) != 1 { 133 return "", i.Op == InstMatch 134 } 135 136 // Have prefix; gather characters. 137 var buf bytes.Buffer 138 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 { 139 buf.WriteRune(i.Rune[0]) 140 i = p.skipNop(i.Out) 141 } 142 return buf.String(), i.Op == InstMatch 143 } 144 145 // StartCond returns the leading empty-width conditions that must 146 // be true in any match. It returns ^EmptyOp(0) if no matches are possible. 147 func (p *Prog) StartCond() EmptyOp { 148 var flag EmptyOp 149 pc := uint32(p.Start) 150 i := &p.Inst[pc] 151 Loop: 152 for { 153 switch i.Op { 154 case InstEmptyWidth: 155 flag |= EmptyOp(i.Arg) 156 case InstFail: 157 return ^EmptyOp(0) 158 case InstCapture, InstNop: 159 // skip 160 default: 161 break Loop 162 } 163 pc = i.Out 164 i = &p.Inst[pc] 165 } 166 return flag 167 } 168 169 // MatchRune returns true if the instruction matches (and consumes) r. 170 // It should only be called when i.Op == InstRune. 171 func (i *Inst) MatchRune(r rune) bool { 172 rune := i.Rune 173 174 // Special case: single-rune slice is from literal string, not char class. 175 if len(rune) == 1 { 176 r0 := rune[0] 177 if r == r0 { 178 return true 179 } 180 if Flags(i.Arg)&FoldCase != 0 { 181 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 182 if r == r1 { 183 return true 184 } 185 } 186 } 187 return false 188 } 189 190 // Peek at the first few pairs. 191 // Should handle ASCII well. 192 for j := 0; j < len(rune) && j <= 8; j += 2 { 193 if r < rune[j] { 194 return false 195 } 196 if r <= rune[j+1] { 197 return true 198 } 199 } 200 201 // Otherwise binary search. 202 lo := 0 203 hi := len(rune) / 2 204 for lo < hi { 205 m := lo + (hi-lo)/2 206 if c := rune[2*m]; c <= r { 207 if r <= rune[2*m+1] { 208 return true 209 } 210 lo = m + 1 211 } else { 212 hi = m 213 } 214 } 215 return false 216 } 217 218 // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char. 219 // Since we act on runes, it would be easy to support Unicode here. 220 func wordRune(r rune) bool { 221 return r == '_' || 222 ('A' <= r && r <= 'Z') || 223 ('a' <= r && r <= 'z') || 224 ('0' <= r && r <= '9') 225 } 226 227 // MatchEmptyWidth returns true if the instruction matches 228 // an empty string between the runes before and after. 229 // It should only be called when i.Op == InstEmptyWidth. 230 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { 231 switch EmptyOp(i.Arg) { 232 case EmptyBeginLine: 233 return before == '\n' || before == -1 234 case EmptyEndLine: 235 return after == '\n' || after == -1 236 case EmptyBeginText: 237 return before == -1 238 case EmptyEndText: 239 return after == -1 240 case EmptyWordBoundary: 241 return wordRune(before) != wordRune(after) 242 case EmptyNoWordBoundary: 243 return wordRune(before) == wordRune(after) 244 } 245 panic("unknown empty width arg") 246 } 247 248 func (i *Inst) String() string { 249 var b bytes.Buffer 250 dumpInst(&b, i) 251 return b.String() 252 } 253 254 func bw(b *bytes.Buffer, args ...string) { 255 for _, s := range args { 256 b.WriteString(s) 257 } 258 } 259 260 func dumpProg(b *bytes.Buffer, p *Prog) { 261 for j := range p.Inst { 262 i := &p.Inst[j] 263 pc := strconv.Itoa(j) 264 if len(pc) < 3 { 265 b.WriteString(" "[len(pc):]) 266 } 267 if j == p.Start { 268 pc += "*" 269 } 270 bw(b, pc, "\t") 271 dumpInst(b, i) 272 bw(b, "\n") 273 } 274 } 275 276 func u32(i uint32) string { 277 return strconv.FormatUint(uint64(i), 10) 278 } 279 280 func dumpInst(b *bytes.Buffer, i *Inst) { 281 switch i.Op { 282 case InstAlt: 283 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 284 case InstAltMatch: 285 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 286 case InstCapture: 287 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 288 case InstEmptyWidth: 289 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 290 case InstMatch: 291 bw(b, "match") 292 case InstFail: 293 bw(b, "fail") 294 case InstNop: 295 bw(b, "nop -> ", u32(i.Out)) 296 case InstRune: 297 if i.Rune == nil { 298 // shouldn't happen 299 bw(b, "rune <nil>") 300 } 301 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 302 if Flags(i.Arg)&FoldCase != 0 { 303 bw(b, "/i") 304 } 305 bw(b, " -> ", u32(i.Out)) 306 case InstRune1: 307 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 308 case InstRuneAny: 309 bw(b, "any -> ", u32(i.Out)) 310 case InstRuneAnyNotNL: 311 bw(b, "anynotnl -> ", u32(i.Out)) 312 } 313 }