github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/regexp/syntax/prog.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "strconv" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 ) 13 14 // Compiled program. 15 // May not belong in this package, but convenient for now. 16 17 // A Prog is a compiled regular expression program. 18 type Prog struct { 19 Inst []Inst 20 Start int // index of start instruction 21 NumCap int // number of InstCapture insts in re 22 } 23 24 // An InstOp is an instruction opcode. 25 type InstOp uint8 26 27 const ( 28 InstAlt InstOp = iota 29 InstAltMatch 30 InstCapture 31 InstEmptyWidth 32 InstMatch 33 InstFail 34 InstNop 35 InstRune 36 InstRune1 37 InstRuneAny 38 InstRuneAnyNotNL 39 ) 40 41 var instOpNames = []string{ 42 "InstAlt", 43 "InstAltMatch", 44 "InstCapture", 45 "InstEmptyWidth", 46 "InstMatch", 47 "InstFail", 48 "InstNop", 49 "InstRune", 50 "InstRune1", 51 "InstRuneAny", 52 "InstRuneAnyNotNL", 53 } 54 55 func (i InstOp) String() string { 56 if uint(i) >= uint(len(instOpNames)) { 57 return "" 58 } 59 return instOpNames[i] 60 } 61 62 // An EmptyOp specifies a kind or mixture of zero-width assertions. 63 type EmptyOp uint8 64 65 const ( 66 EmptyBeginLine EmptyOp = 1 << iota 67 EmptyEndLine 68 EmptyBeginText 69 EmptyEndText 70 EmptyWordBoundary 71 EmptyNoWordBoundary 72 ) 73 74 // EmptyOpContext returns the zero-width assertions 75 // satisfied at the position between the runes r1 and r2. 76 // Passing r1 == -1 indicates that the position is 77 // at the beginning of the text. 78 // Passing r2 == -1 indicates that the position is 79 // at the end of the text. 80 func EmptyOpContext(r1, r2 rune) EmptyOp { 81 var op EmptyOp = EmptyNoWordBoundary 82 var boundary byte 83 switch { 84 case IsWordChar(r1): 85 boundary = 1 86 case r1 == '\n': 87 op |= EmptyBeginLine 88 case r1 < 0: 89 op |= EmptyBeginText | EmptyBeginLine 90 } 91 switch { 92 case IsWordChar(r2): 93 boundary ^= 1 94 case r2 == '\n': 95 op |= EmptyEndLine 96 case r2 < 0: 97 op |= EmptyEndText | EmptyEndLine 98 } 99 if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2) 100 op ^= (EmptyWordBoundary | EmptyNoWordBoundary) 101 } 102 return op 103 } 104 105 // IsWordChar reports whether r is considered a “word character” 106 // during the evaluation of the \b and \B zero-width assertions. 107 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. 108 func IsWordChar(r rune) bool { 109 // Test for lowercase letters first, as these occur more 110 // frequently than uppercase letters in common cases. 111 return 'a' <= r && r <= 'z' || 'A' <= r && r <= 'Z' || '0' <= r && r <= '9' || r == '_' 112 } 113 114 // An Inst is a single instruction in a regular expression program. 115 type Inst struct { 116 Op InstOp 117 Out uint32 // all but InstMatch, InstFail 118 Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth 119 Rune []rune 120 } 121 122 func (p *Prog) String() string { 123 var b strings.Builder 124 dumpProg(&b, p) 125 return b.String() 126 } 127 128 // skipNop follows any no-op or capturing instructions. 129 func (p *Prog) skipNop(pc uint32) *Inst { 130 i := &p.Inst[pc] 131 for i.Op == InstNop || i.Op == InstCapture { 132 i = &p.Inst[i.Out] 133 } 134 return i 135 } 136 137 // op returns i.Op but merges all the Rune special cases into InstRune 138 func (i *Inst) op() InstOp { 139 op := i.Op 140 switch op { 141 case InstRune1, InstRuneAny, InstRuneAnyNotNL: 142 op = InstRune 143 } 144 return op 145 } 146 147 // Prefix returns a literal string that all matches for the 148 // regexp must start with. Complete is true if the prefix 149 // is the entire match. 150 func (p *Prog) Prefix() (prefix string, complete bool) { 151 i := p.skipNop(uint32(p.Start)) 152 153 // Avoid allocation of buffer if prefix is empty. 154 if i.op() != InstRune || len(i.Rune) != 1 { 155 return "", i.Op == InstMatch 156 } 157 158 // Have prefix; gather characters. 159 var buf strings.Builder 160 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError { 161 buf.WriteRune(i.Rune[0]) 162 i = p.skipNop(i.Out) 163 } 164 return buf.String(), i.Op == InstMatch 165 } 166 167 // StartCond returns the leading empty-width conditions that must 168 // be true in any match. It returns ^EmptyOp(0) if no matches are possible. 169 func (p *Prog) StartCond() EmptyOp { 170 var flag EmptyOp 171 pc := uint32(p.Start) 172 i := &p.Inst[pc] 173 Loop: 174 for { 175 switch i.Op { 176 case InstEmptyWidth: 177 flag |= EmptyOp(i.Arg) 178 case InstFail: 179 return ^EmptyOp(0) 180 case InstCapture, InstNop: 181 // skip 182 default: 183 break Loop 184 } 185 pc = i.Out 186 i = &p.Inst[pc] 187 } 188 return flag 189 } 190 191 const noMatch = -1 192 193 // MatchRune reports whether the instruction matches (and consumes) r. 194 // It should only be called when i.Op == [InstRune]. 195 func (i *Inst) MatchRune(r rune) bool { 196 return i.MatchRunePos(r) != noMatch 197 } 198 199 // MatchRunePos checks whether the instruction matches (and consumes) r. 200 // If so, MatchRunePos returns the index of the matching rune pair 201 // (or, when len(i.Rune) == 1, rune singleton). 202 // If not, MatchRunePos returns -1. 203 // MatchRunePos should only be called when i.Op == [InstRune]. 204 func (i *Inst) MatchRunePos(r rune) int { 205 rune := i.Rune 206 207 switch len(rune) { 208 case 0: 209 return noMatch 210 211 case 1: 212 // Special case: single-rune slice is from literal string, not char class. 213 r0 := rune[0] 214 if r == r0 { 215 return 0 216 } 217 if Flags(i.Arg)&FoldCase != 0 { 218 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 219 if r == r1 { 220 return 0 221 } 222 } 223 } 224 return noMatch 225 226 case 2: 227 if r >= rune[0] && r <= rune[1] { 228 return 0 229 } 230 return noMatch 231 232 case 4, 6, 8: 233 // Linear search for a few pairs. 234 // Should handle ASCII well. 235 for j := 0; j < len(rune); j += 2 { 236 if r < rune[j] { 237 return noMatch 238 } 239 if r <= rune[j+1] { 240 return j / 2 241 } 242 } 243 return noMatch 244 } 245 246 // Otherwise binary search. 247 lo := 0 248 hi := len(rune) / 2 249 for lo < hi { 250 m := int(uint(lo+hi) >> 1) 251 if c := rune[2*m]; c <= r { 252 if r <= rune[2*m+1] { 253 return m 254 } 255 lo = m + 1 256 } else { 257 hi = m 258 } 259 } 260 return noMatch 261 } 262 263 // MatchEmptyWidth reports whether the instruction matches 264 // an empty string between the runes before and after. 265 // It should only be called when i.Op == [InstEmptyWidth]. 266 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { 267 switch EmptyOp(i.Arg) { 268 case EmptyBeginLine: 269 return before == '\n' || before == -1 270 case EmptyEndLine: 271 return after == '\n' || after == -1 272 case EmptyBeginText: 273 return before == -1 274 case EmptyEndText: 275 return after == -1 276 case EmptyWordBoundary: 277 return IsWordChar(before) != IsWordChar(after) 278 case EmptyNoWordBoundary: 279 return IsWordChar(before) == IsWordChar(after) 280 } 281 panic("unknown empty width arg") 282 } 283 284 func (i *Inst) String() string { 285 var b strings.Builder 286 dumpInst(&b, i) 287 return b.String() 288 } 289 290 func bw(b *strings.Builder, args ...string) { 291 for _, s := range args { 292 b.WriteString(s) 293 } 294 } 295 296 func dumpProg(b *strings.Builder, p *Prog) { 297 for j := range p.Inst { 298 i := &p.Inst[j] 299 pc := strconv.Itoa(j) 300 if len(pc) < 3 { 301 b.WriteString(" "[len(pc):]) 302 } 303 if j == p.Start { 304 pc += "*" 305 } 306 bw(b, pc, "\t") 307 dumpInst(b, i) 308 bw(b, "\n") 309 } 310 } 311 312 func u32(i uint32) string { 313 return strconv.FormatUint(uint64(i), 10) 314 } 315 316 func dumpInst(b *strings.Builder, i *Inst) { 317 switch i.Op { 318 case InstAlt: 319 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 320 case InstAltMatch: 321 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 322 case InstCapture: 323 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 324 case InstEmptyWidth: 325 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 326 case InstMatch: 327 bw(b, "match") 328 case InstFail: 329 bw(b, "fail") 330 case InstNop: 331 bw(b, "nop -> ", u32(i.Out)) 332 case InstRune: 333 if i.Rune == nil { 334 // shouldn't happen 335 bw(b, "rune <nil>") 336 } 337 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 338 if Flags(i.Arg)&FoldCase != 0 { 339 bw(b, "/i") 340 } 341 bw(b, " -> ", u32(i.Out)) 342 case InstRune1: 343 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 344 case InstRuneAny: 345 bw(b, "any -> ", u32(i.Out)) 346 case InstRuneAnyNotNL: 347 bw(b, "anynotnl -> ", u32(i.Out)) 348 } 349 }