github.com/AESNooper/go/src@v0.0.0-20220218095104-b56a4ab1bbbb/regexp/syntax/prog.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "strconv" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 ) 13 14 // Compiled program. 15 // May not belong in this package, but convenient for now. 16 17 // A Prog is a compiled regular expression program. 18 type Prog struct { 19 Inst []Inst 20 Start int // index of start instruction 21 NumCap int // number of InstCapture insts in re 22 } 23 24 // An InstOp is an instruction opcode. 25 type InstOp uint8 26 27 const ( 28 InstAlt InstOp = iota 29 InstAltMatch 30 InstCapture 31 InstEmptyWidth 32 InstMatch 33 InstFail 34 InstNop 35 InstRune 36 InstRune1 37 InstRuneAny 38 InstRuneAnyNotNL 39 ) 40 41 var instOpNames = []string{ 42 "InstAlt", 43 "InstAltMatch", 44 "InstCapture", 45 "InstEmptyWidth", 46 "InstMatch", 47 "InstFail", 48 "InstNop", 49 "InstRune", 50 "InstRune1", 51 "InstRuneAny", 52 "InstRuneAnyNotNL", 53 } 54 55 func (i InstOp) String() string { 56 if uint(i) >= uint(len(instOpNames)) { 57 return "" 58 } 59 return instOpNames[i] 60 } 61 62 // An EmptyOp specifies a kind or mixture of zero-width assertions. 63 type EmptyOp uint8 64 65 const ( 66 EmptyBeginLine EmptyOp = 1 << iota 67 EmptyEndLine 68 EmptyBeginText 69 EmptyEndText 70 EmptyWordBoundary 71 EmptyNoWordBoundary 72 ) 73 74 // EmptyOpContext returns the zero-width assertions 75 // satisfied at the position between the runes r1 and r2. 76 // Passing r1 == -1 indicates that the position is 77 // at the beginning of the text. 78 // Passing r2 == -1 indicates that the position is 79 // at the end of the text. 80 func EmptyOpContext(r1, r2 rune) EmptyOp { 81 var op EmptyOp = EmptyNoWordBoundary 82 var boundary byte 83 switch { 84 case IsWordChar(r1): 85 boundary = 1 86 case r1 == '\n': 87 op |= EmptyBeginLine 88 case r1 < 0: 89 op |= EmptyBeginText | EmptyBeginLine 90 } 91 switch { 92 case IsWordChar(r2): 93 boundary ^= 1 94 case r2 == '\n': 95 op |= EmptyEndLine 96 case r2 < 0: 97 op |= EmptyEndText | EmptyEndLine 98 } 99 if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2) 100 op ^= (EmptyWordBoundary | EmptyNoWordBoundary) 101 } 102 return op 103 } 104 105 // IsWordChar reports whether r is consider a ``word character'' 106 // during the evaluation of the \b and \B zero-width assertions. 107 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. 108 func IsWordChar(r rune) bool { 109 return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' 110 } 111 112 // An Inst is a single instruction in a regular expression program. 113 type Inst struct { 114 Op InstOp 115 Out uint32 // all but InstMatch, InstFail 116 Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth 117 Rune []rune 118 } 119 120 func (p *Prog) String() string { 121 var b strings.Builder 122 dumpProg(&b, p) 123 return b.String() 124 } 125 126 // skipNop follows any no-op or capturing instructions. 127 func (p *Prog) skipNop(pc uint32) *Inst { 128 i := &p.Inst[pc] 129 for i.Op == InstNop || i.Op == InstCapture { 130 i = &p.Inst[i.Out] 131 } 132 return i 133 } 134 135 // op returns i.Op but merges all the Rune special cases into InstRune 136 func (i *Inst) op() InstOp { 137 op := i.Op 138 switch op { 139 case InstRune1, InstRuneAny, InstRuneAnyNotNL: 140 op = InstRune 141 } 142 return op 143 } 144 145 // Prefix returns a literal string that all matches for the 146 // regexp must start with. Complete is true if the prefix 147 // is the entire match. 148 func (p *Prog) Prefix() (prefix string, complete bool) { 149 i := p.skipNop(uint32(p.Start)) 150 151 // Avoid allocation of buffer if prefix is empty. 152 if i.op() != InstRune || len(i.Rune) != 1 { 153 return "", i.Op == InstMatch 154 } 155 156 // Have prefix; gather characters. 157 var buf strings.Builder 158 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError { 159 buf.WriteRune(i.Rune[0]) 160 i = p.skipNop(i.Out) 161 } 162 return buf.String(), i.Op == InstMatch 163 } 164 165 // StartCond returns the leading empty-width conditions that must 166 // be true in any match. It returns ^EmptyOp(0) if no matches are possible. 167 func (p *Prog) StartCond() EmptyOp { 168 var flag EmptyOp 169 pc := uint32(p.Start) 170 i := &p.Inst[pc] 171 Loop: 172 for { 173 switch i.Op { 174 case InstEmptyWidth: 175 flag |= EmptyOp(i.Arg) 176 case InstFail: 177 return ^EmptyOp(0) 178 case InstCapture, InstNop: 179 // skip 180 default: 181 break Loop 182 } 183 pc = i.Out 184 i = &p.Inst[pc] 185 } 186 return flag 187 } 188 189 const noMatch = -1 190 191 // MatchRune reports whether the instruction matches (and consumes) r. 192 // It should only be called when i.Op == InstRune. 193 func (i *Inst) MatchRune(r rune) bool { 194 return i.MatchRunePos(r) != noMatch 195 } 196 197 // MatchRunePos checks whether the instruction matches (and consumes) r. 198 // If so, MatchRunePos returns the index of the matching rune pair 199 // (or, when len(i.Rune) == 1, rune singleton). 200 // If not, MatchRunePos returns -1. 201 // MatchRunePos should only be called when i.Op == InstRune. 202 func (i *Inst) MatchRunePos(r rune) int { 203 rune := i.Rune 204 205 switch len(rune) { 206 case 0: 207 return noMatch 208 209 case 1: 210 // Special case: single-rune slice is from literal string, not char class. 211 r0 := rune[0] 212 if r == r0 { 213 return 0 214 } 215 if Flags(i.Arg)&FoldCase != 0 { 216 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 217 if r == r1 { 218 return 0 219 } 220 } 221 } 222 return noMatch 223 224 case 2: 225 if r >= rune[0] && r <= rune[1] { 226 return 0 227 } 228 return noMatch 229 230 case 4, 6, 8: 231 // Linear search for a few pairs. 232 // Should handle ASCII well. 233 for j := 0; j < len(rune); j += 2 { 234 if r < rune[j] { 235 return noMatch 236 } 237 if r <= rune[j+1] { 238 return j / 2 239 } 240 } 241 return noMatch 242 } 243 244 // Otherwise binary search. 245 lo := 0 246 hi := len(rune) / 2 247 for lo < hi { 248 m := lo + (hi-lo)/2 249 if c := rune[2*m]; c <= r { 250 if r <= rune[2*m+1] { 251 return m 252 } 253 lo = m + 1 254 } else { 255 hi = m 256 } 257 } 258 return noMatch 259 } 260 261 // MatchEmptyWidth reports whether the instruction matches 262 // an empty string between the runes before and after. 263 // It should only be called when i.Op == InstEmptyWidth. 264 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { 265 switch EmptyOp(i.Arg) { 266 case EmptyBeginLine: 267 return before == '\n' || before == -1 268 case EmptyEndLine: 269 return after == '\n' || after == -1 270 case EmptyBeginText: 271 return before == -1 272 case EmptyEndText: 273 return after == -1 274 case EmptyWordBoundary: 275 return IsWordChar(before) != IsWordChar(after) 276 case EmptyNoWordBoundary: 277 return IsWordChar(before) == IsWordChar(after) 278 } 279 panic("unknown empty width arg") 280 } 281 282 func (i *Inst) String() string { 283 var b strings.Builder 284 dumpInst(&b, i) 285 return b.String() 286 } 287 288 func bw(b *strings.Builder, args ...string) { 289 for _, s := range args { 290 b.WriteString(s) 291 } 292 } 293 294 func dumpProg(b *strings.Builder, p *Prog) { 295 for j := range p.Inst { 296 i := &p.Inst[j] 297 pc := strconv.Itoa(j) 298 if len(pc) < 3 { 299 b.WriteString(" "[len(pc):]) 300 } 301 if j == p.Start { 302 pc += "*" 303 } 304 bw(b, pc, "\t") 305 dumpInst(b, i) 306 bw(b, "\n") 307 } 308 } 309 310 func u32(i uint32) string { 311 return strconv.FormatUint(uint64(i), 10) 312 } 313 314 func dumpInst(b *strings.Builder, i *Inst) { 315 switch i.Op { 316 case InstAlt: 317 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 318 case InstAltMatch: 319 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 320 case InstCapture: 321 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 322 case InstEmptyWidth: 323 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 324 case InstMatch: 325 bw(b, "match") 326 case InstFail: 327 bw(b, "fail") 328 case InstNop: 329 bw(b, "nop -> ", u32(i.Out)) 330 case InstRune: 331 if i.Rune == nil { 332 // shouldn't happen 333 bw(b, "rune <nil>") 334 } 335 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 336 if Flags(i.Arg)&FoldCase != 0 { 337 bw(b, "/i") 338 } 339 bw(b, " -> ", u32(i.Out)) 340 case InstRune1: 341 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 342 case InstRuneAny: 343 bw(b, "any -> ", u32(i.Out)) 344 case InstRuneAnyNotNL: 345 bw(b, "anynotnl -> ", u32(i.Out)) 346 } 347 }