github.com/guyezi/gofrontend@v0.0.0-20200228202240-7a62a49e62c0/libgo/go/regexp/syntax/prog.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "strconv" 9 "strings" 10 "unicode" 11 ) 12 13 // Compiled program. 14 // May not belong in this package, but convenient for now. 15 16 // A Prog is a compiled regular expression program. 17 type Prog struct { 18 Inst []Inst 19 Start int // index of start instruction 20 NumCap int // number of InstCapture insts in re 21 } 22 23 // An InstOp is an instruction opcode. 24 type InstOp uint8 25 26 const ( 27 InstAlt InstOp = iota 28 InstAltMatch 29 InstCapture 30 InstEmptyWidth 31 InstMatch 32 InstFail 33 InstNop 34 InstRune 35 InstRune1 36 InstRuneAny 37 InstRuneAnyNotNL 38 ) 39 40 var instOpNames = []string{ 41 "InstAlt", 42 "InstAltMatch", 43 "InstCapture", 44 "InstEmptyWidth", 45 "InstMatch", 46 "InstFail", 47 "InstNop", 48 "InstRune", 49 "InstRune1", 50 "InstRuneAny", 51 "InstRuneAnyNotNL", 52 } 53 54 func (i InstOp) String() string { 55 if uint(i) >= uint(len(instOpNames)) { 56 return "" 57 } 58 return instOpNames[i] 59 } 60 61 // An EmptyOp specifies a kind or mixture of zero-width assertions. 62 type EmptyOp uint8 63 64 const ( 65 EmptyBeginLine EmptyOp = 1 << iota 66 EmptyEndLine 67 EmptyBeginText 68 EmptyEndText 69 EmptyWordBoundary 70 EmptyNoWordBoundary 71 ) 72 73 // EmptyOpContext returns the zero-width assertions 74 // satisfied at the position between the runes r1 and r2. 75 // Passing r1 == -1 indicates that the position is 76 // at the beginning of the text. 77 // Passing r2 == -1 indicates that the position is 78 // at the end of the text. 79 func EmptyOpContext(r1, r2 rune) EmptyOp { 80 var op EmptyOp = EmptyNoWordBoundary 81 var boundary byte 82 switch { 83 case IsWordChar(r1): 84 boundary = 1 85 case r1 == '\n': 86 op |= EmptyBeginLine 87 case r1 < 0: 88 op |= EmptyBeginText | EmptyBeginLine 89 } 90 switch { 91 case IsWordChar(r2): 92 boundary ^= 1 93 case r2 == '\n': 94 op |= EmptyEndLine 95 case r2 < 0: 96 op |= EmptyEndText | EmptyEndLine 97 } 98 if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2) 99 op ^= (EmptyWordBoundary | EmptyNoWordBoundary) 100 } 101 return op 102 } 103 104 // IsWordChar reports whether r is consider a ``word character'' 105 // during the evaluation of the \b and \B zero-width assertions. 106 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. 107 func IsWordChar(r rune) bool { 108 return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' 109 } 110 111 // An Inst is a single instruction in a regular expression program. 112 type Inst struct { 113 Op InstOp 114 Out uint32 // all but InstMatch, InstFail 115 Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth 116 Rune []rune 117 } 118 119 func (p *Prog) String() string { 120 var b strings.Builder 121 dumpProg(&b, p) 122 return b.String() 123 } 124 125 // skipNop follows any no-op or capturing instructions. 126 func (p *Prog) skipNop(pc uint32) *Inst { 127 i := &p.Inst[pc] 128 for i.Op == InstNop || i.Op == InstCapture { 129 i = &p.Inst[i.Out] 130 } 131 return i 132 } 133 134 // op returns i.Op but merges all the Rune special cases into InstRune 135 func (i *Inst) op() InstOp { 136 op := i.Op 137 switch op { 138 case InstRune1, InstRuneAny, InstRuneAnyNotNL: 139 op = InstRune 140 } 141 return op 142 } 143 144 // Prefix returns a literal string that all matches for the 145 // regexp must start with. Complete is true if the prefix 146 // is the entire match. 147 func (p *Prog) Prefix() (prefix string, complete bool) { 148 i := p.skipNop(uint32(p.Start)) 149 150 // Avoid allocation of buffer if prefix is empty. 151 if i.op() != InstRune || len(i.Rune) != 1 { 152 return "", i.Op == InstMatch 153 } 154 155 // Have prefix; gather characters. 156 var buf strings.Builder 157 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 { 158 buf.WriteRune(i.Rune[0]) 159 i = p.skipNop(i.Out) 160 } 161 return buf.String(), i.Op == InstMatch 162 } 163 164 // StartCond returns the leading empty-width conditions that must 165 // be true in any match. It returns ^EmptyOp(0) if no matches are possible. 166 func (p *Prog) StartCond() EmptyOp { 167 var flag EmptyOp 168 pc := uint32(p.Start) 169 i := &p.Inst[pc] 170 Loop: 171 for { 172 switch i.Op { 173 case InstEmptyWidth: 174 flag |= EmptyOp(i.Arg) 175 case InstFail: 176 return ^EmptyOp(0) 177 case InstCapture, InstNop: 178 // skip 179 default: 180 break Loop 181 } 182 pc = i.Out 183 i = &p.Inst[pc] 184 } 185 return flag 186 } 187 188 const noMatch = -1 189 190 // MatchRune reports whether the instruction matches (and consumes) r. 191 // It should only be called when i.Op == InstRune. 192 func (i *Inst) MatchRune(r rune) bool { 193 return i.MatchRunePos(r) != noMatch 194 } 195 196 // MatchRunePos checks whether the instruction matches (and consumes) r. 197 // If so, MatchRunePos returns the index of the matching rune pair 198 // (or, when len(i.Rune) == 1, rune singleton). 199 // If not, MatchRunePos returns -1. 200 // MatchRunePos should only be called when i.Op == InstRune. 201 func (i *Inst) MatchRunePos(r rune) int { 202 rune := i.Rune 203 204 switch len(rune) { 205 case 0: 206 return noMatch 207 208 case 1: 209 // Special case: single-rune slice is from literal string, not char class. 210 r0 := rune[0] 211 if r == r0 { 212 return 0 213 } 214 if Flags(i.Arg)&FoldCase != 0 { 215 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 216 if r == r1 { 217 return 0 218 } 219 } 220 } 221 return noMatch 222 223 case 2: 224 if r >= rune[0] && r <= rune[1] { 225 return 0 226 } 227 return noMatch 228 229 case 4, 6, 8: 230 // Linear search for a few pairs. 231 // Should handle ASCII well. 232 for j := 0; j < len(rune); j += 2 { 233 if r < rune[j] { 234 return noMatch 235 } 236 if r <= rune[j+1] { 237 return j / 2 238 } 239 } 240 return noMatch 241 } 242 243 // Otherwise binary search. 244 lo := 0 245 hi := len(rune) / 2 246 for lo < hi { 247 m := lo + (hi-lo)/2 248 if c := rune[2*m]; c <= r { 249 if r <= rune[2*m+1] { 250 return m 251 } 252 lo = m + 1 253 } else { 254 hi = m 255 } 256 } 257 return noMatch 258 } 259 260 // MatchEmptyWidth reports whether the instruction matches 261 // an empty string between the runes before and after. 262 // It should only be called when i.Op == InstEmptyWidth. 263 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { 264 switch EmptyOp(i.Arg) { 265 case EmptyBeginLine: 266 return before == '\n' || before == -1 267 case EmptyEndLine: 268 return after == '\n' || after == -1 269 case EmptyBeginText: 270 return before == -1 271 case EmptyEndText: 272 return after == -1 273 case EmptyWordBoundary: 274 return IsWordChar(before) != IsWordChar(after) 275 case EmptyNoWordBoundary: 276 return IsWordChar(before) == IsWordChar(after) 277 } 278 panic("unknown empty width arg") 279 } 280 281 func (i *Inst) String() string { 282 var b strings.Builder 283 dumpInst(&b, i) 284 return b.String() 285 } 286 287 func bw(b *strings.Builder, args ...string) { 288 for _, s := range args { 289 b.WriteString(s) 290 } 291 } 292 293 func dumpProg(b *strings.Builder, p *Prog) { 294 for j := range p.Inst { 295 i := &p.Inst[j] 296 pc := strconv.Itoa(j) 297 if len(pc) < 3 { 298 b.WriteString(" "[len(pc):]) 299 } 300 if j == p.Start { 301 pc += "*" 302 } 303 bw(b, pc, "\t") 304 dumpInst(b, i) 305 bw(b, "\n") 306 } 307 } 308 309 func u32(i uint32) string { 310 return strconv.FormatUint(uint64(i), 10) 311 } 312 313 func dumpInst(b *strings.Builder, i *Inst) { 314 switch i.Op { 315 case InstAlt: 316 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 317 case InstAltMatch: 318 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 319 case InstCapture: 320 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 321 case InstEmptyWidth: 322 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 323 case InstMatch: 324 bw(b, "match") 325 case InstFail: 326 bw(b, "fail") 327 case InstNop: 328 bw(b, "nop -> ", u32(i.Out)) 329 case InstRune: 330 if i.Rune == nil { 331 // shouldn't happen 332 bw(b, "rune <nil>") 333 } 334 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 335 if Flags(i.Arg)&FoldCase != 0 { 336 bw(b, "/i") 337 } 338 bw(b, " -> ", u32(i.Out)) 339 case InstRune1: 340 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 341 case InstRuneAny: 342 bw(b, "any -> ", u32(i.Out)) 343 case InstRuneAnyNotNL: 344 bw(b, "anynotnl -> ", u32(i.Out)) 345 } 346 }