// Source: github.com/gnolang/gno@v0.0.0-20240520182011-228e9d0192ce/gnovm/stdlibs/regexp/exec.gno

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package regexp

import (
	"io"
	"regexp/syntax"
)

// A queue is a 'sparse array' holding pending threads of execution.
// See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
type queue struct {
	// sparse is indexed by instruction pc; sparse[pc] is the index in dense
	// of the entry recorded for pc. A slot is only trusted when it points
	// inside dense and that dense entry carries the same pc.
	sparse []uint32
	// dense holds the entries in the order in which they were added.
	dense []entry
}

// An entry is an entry on a queue.
// It holds both the instruction pc and the actual thread.
// Some queue entries are just place holders so that the machine
// knows it has considered that pc. Such entries have t == nil.
type entry struct {
	pc uint32
	t  *thread
}

// A thread is the state of a single path through the machine:
// an instruction and a corresponding capture array.
// See https://swtch.com/~rsc/regexp/regexp2.html
type thread struct {
	inst *syntax.Inst // instruction this thread is waiting to execute
	cap  []int        // capture positions recorded along this path
}

// A machine holds all the state during an NFA simulation for p.
37 type machine struct { 38 re *Regexp // corresponding Regexp 39 p *syntax.Prog // compiled program 40 q0, q1 queue // two queues for runq, nextq 41 pool []*thread // pool of available threads 42 matched bool // whether a match was found 43 matchcap []int // capture information for the match 44 45 inputs inputs 46 } 47 48 type inputs struct { 49 // cached inputs, to avoid allocation 50 bytes inputBytes 51 string inputString 52 reader inputReader 53 } 54 55 func (i *inputs) newBytes(b []byte) input { 56 i.bytes.str = b 57 return &i.bytes 58 } 59 60 func (i *inputs) newString(s string) input { 61 i.string.str = s 62 return &i.string 63 } 64 65 func (i *inputs) newReader(r io.RuneReader) input { 66 i.reader.r = r 67 i.reader.atEOT = false 68 i.reader.pos = 0 69 return &i.reader 70 } 71 72 func (i *inputs) clear() { 73 // We need to clear 1 of these. 74 // Avoid the expense of clearing the others (pointer write barrier). 75 if i.bytes.str != nil { 76 i.bytes.str = nil 77 } else if i.reader.r != nil { 78 i.reader.r = nil 79 } else { 80 i.string.str = "" 81 } 82 } 83 84 func (i *inputs) init(r io.RuneReader, b []byte, s string) (input, int) { 85 if r != nil { 86 return i.newReader(r), 0 87 } 88 if b != nil { 89 return i.newBytes(b), len(b) 90 } 91 return i.newString(s), len(s) 92 } 93 94 func (m *machine) init(ncap int) { 95 for _, t := range m.pool { 96 t.cap = t.cap[:ncap] 97 } 98 m.matchcap = m.matchcap[:ncap] 99 } 100 101 // alloc allocates a new thread with the given instruction. 102 // It uses the free pool if possible. 103 func (m *machine) alloc(i *syntax.Inst) *thread { 104 var t *thread 105 if n := len(m.pool); n > 0 { 106 t = m.pool[n-1] 107 m.pool = m.pool[:n-1] 108 } else { 109 t = new(thread) 110 t.cap = make([]int, len(m.matchcap), cap(m.matchcap)) 111 } 112 t.inst = i 113 return t 114 } 115 116 // A lazyFlag is a lazily-evaluated syntax.EmptyOp, 117 // for checking zero-width flags like ^ $ \A \z \B \b. 
118 // It records the pair of relevant runes and does not 119 // determine the implied flags until absolutely necessary 120 // (most of the time, that means never). 121 type lazyFlag uint64 122 123 func newLazyFlag(r1, r2 rune) lazyFlag { 124 return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2))) 125 } 126 127 func (f lazyFlag) match(op syntax.EmptyOp) bool { 128 if op == 0 { 129 return true 130 } 131 r1 := rune(f >> 32) 132 if op&syntax.EmptyBeginLine != 0 { 133 if r1 != '\n' && r1 >= 0 { 134 return false 135 } 136 op &^= syntax.EmptyBeginLine 137 } 138 if op&syntax.EmptyBeginText != 0 { 139 if r1 >= 0 { 140 return false 141 } 142 op &^= syntax.EmptyBeginText 143 } 144 if op == 0 { 145 return true 146 } 147 r2 := rune(f) 148 if op&syntax.EmptyEndLine != 0 { 149 if r2 != '\n' && r2 >= 0 { 150 return false 151 } 152 op &^= syntax.EmptyEndLine 153 } 154 if op&syntax.EmptyEndText != 0 { 155 if r2 >= 0 { 156 return false 157 } 158 op &^= syntax.EmptyEndText 159 } 160 if op == 0 { 161 return true 162 } 163 if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) { 164 op &^= syntax.EmptyWordBoundary 165 } else { 166 op &^= syntax.EmptyNoWordBoundary 167 } 168 return op == 0 169 } 170 171 // match runs the machine over the input starting at pos. 172 // It reports whether a match was found. 173 // If so, m.matchcap holds the submatch information. 
func (m *machine) match(i input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	// Reset every capture slot to -1 before the search begins.
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}
	runq, nextq := &m.q0, &m.q1
	// r is the rune handed to the next step and r1 is the lookahead rune
	// that follows it; width and width1 are the widths i.step reported
	// for them.
	r, r1 := endOfText, endOfText
	width, width1 := 0, 0
	r, width = i.step(pos)
	if r != endOfText {
		r1, width1 = i.step(pos + width)
	}
	var flag lazyFlag
	if pos == 0 {
		// At position 0 there is no preceding rune; -1 stands in for it.
		flag = newLazyFlag(-1, r)
	} else {
		flag = i.context(pos)
	}
	for {
		if len(runq.dense) == 0 {
			// No live threads: decide whether starting again at pos is
			// worthwhile at all.
			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
				// Anchored match, past beginning of text.
				break
			}
			if m.matched {
				// Have match; finished exploring alternatives.
				break
			}
			if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
				// Match requires literal prefix; fast search for it.
				advance := i.index(m.re, pos)
				if advance < 0 {
					break
				}
				// Jump to the reported offset and reload both runes.
				pos += advance
				r, width = i.step(pos)
				r1, width1 = i.step(pos + width)
			}
		}
		if !m.matched {
			// No match yet: seed a thread at the program start so that a
			// match beginning at pos is also considered.
			if len(m.matchcap) > 0 {
				m.matchcap[0] = pos
			}
			m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil)
		}
		// flag now describes the boundary after r (between r and r1);
		// step uses it as nextCond when adding threads to nextq.
		flag = newLazyFlag(r, r1)
		m.step(runq, nextq, pos, pos+width, r, &flag)
		if width == 0 {
			// NOTE(review): presumably i.step reports width 0 only when
			// there is no more input — confirm against the input types.
			break
		}
		if len(m.matchcap) == 0 && m.matched {
			// Found a match and not paying attention
			// to where it is, so any match will do.
			break
		}
		// Advance one rune: the lookahead becomes the current rune, and a
		// new lookahead is read unless the text has already ended.
		pos += width
		r, width = r1, width1
		if r != endOfText {
			r1, width1 = i.step(pos + width)
		}
		runq, nextq = nextq, runq
	}
	// Hand any threads still sitting on nextq back to the pool.
	m.clear(nextq)
	return m.matched
}

// clear frees all threads on the thread queue.
245 func (m *machine) clear(q *queue) { 246 for _, d := range q.dense { 247 if d.t != nil { 248 m.pool = append(m.pool, d.t) 249 } 250 } 251 q.dense = q.dense[:0] 252 } 253 254 // step executes one step of the machine, running each of the threads 255 // on runq and appending new threads to nextq. 256 // The step processes the rune c (which may be endOfText), 257 // which starts at position pos and ends at nextPos. 258 // nextCond gives the setting for the empty-width flags after c. 259 func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) { 260 longest := m.re.longest 261 for j := 0; j < len(runq.dense); j++ { 262 d := &runq.dense[j] 263 t := d.t 264 if t == nil { 265 continue 266 } 267 if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { 268 m.pool = append(m.pool, t) 269 continue 270 } 271 i := t.inst 272 add := false 273 switch i.Op { 274 default: 275 panic("bad inst") 276 277 case syntax.InstMatch: 278 if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { 279 t.cap[1] = pos 280 copy(m.matchcap, t.cap) 281 } 282 if !longest { 283 // First-match mode: cut off all lower-priority threads. 284 for _, d := range runq.dense[j+1:] { 285 if d.t != nil { 286 m.pool = append(m.pool, d.t) 287 } 288 } 289 runq.dense = runq.dense[:0] 290 } 291 m.matched = true 292 293 case syntax.InstRune: 294 add = i.MatchRune(c) 295 case syntax.InstRune1: 296 add = c == i.Rune[0] 297 case syntax.InstRuneAny: 298 add = true 299 case syntax.InstRuneAnyNotNL: 300 add = c != '\n' 301 } 302 if add { 303 t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) 304 } 305 if t != nil { 306 m.pool = append(m.pool, t) 307 } 308 } 309 runq.dense = runq.dense[:0] 310 } 311 312 // add adds an entry to q for pc, unless the q already has such an entry. 313 // It also recursively adds an entry for all instructions reachable from pc by following 314 // empty-width conditions satisfied by cond. 
// pos gives the current position
// in the input.
//
// cap_ holds the capture positions to copy into any thread placed on q.
// t, if non-nil, is a thread that add may reuse; add returns t if it was
// not used and nil once it has been placed on the queue.
func (m *machine) add(q *queue, pc uint32, pos int, cap_ []int, cond *lazyFlag, t *thread) *thread {
Again:
	// NOTE(review): pc 0 is never queued — presumably it is the program's
	// fail instruction; confirm against the compiler.
	if pc == 0 {
		return t
	}
	// Sparse-set membership test: pc is already on q only if its sparse
	// slot points at a live dense entry that names the same pc.
	if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
		return t
	}

	// Claim the next dense slot as a placeholder (t == nil) so that pc is
	// not visited again. The reslice grows dense within its existing
	// capacity, which therefore must already be large enough.
	j := len(q.dense)
	q.dense = q.dense[:j+1]
	d := &q.dense[j]
	d.t = nil
	d.pc = pc
	q.sparse[pc] = uint32(j)

	i := &m.p.Inst[pc]
	switch i.Op {
	default:
		panic("unhandled")
	case syntax.InstFail:
		// nothing
	case syntax.InstAlt, syntax.InstAltMatch:
		// Explore the Out branch recursively, then loop on the Arg branch.
		t = m.add(q, i.Out, pos, cap_, cond, t)
		pc = i.Arg
		goto Again
	case syntax.InstEmptyWidth:
		// Follow the edge only if the zero-width condition holds here.
		if cond.match(syntax.EmptyOp(i.Arg)) {
			pc = i.Out
			goto Again
		}
	case syntax.InstNop:
		pc = i.Out
		goto Again
	case syntax.InstCapture:
		if int(i.Arg) < len(cap_) {
			// Record pos in the capture slot while exploring what follows,
			// then restore the previous value. t is not passed down, so it
			// stays available to the caller.
			opos := cap_[i.Arg]
			cap_[i.Arg] = pos
			m.add(q, i.Out, pos, cap_, cond, nil)
			cap_[i.Arg] = opos
		} else {
			// This capture is not being tracked; just follow the edge.
			pc = i.Out
			goto Again
		}
	case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		// These instructions need a real thread: reuse t when one was
		// supplied, otherwise allocate one.
		if t == nil {
			t = m.alloc(i)
		} else {
			t.inst = i
		}
		// Skip the copy when t.cap and cap_ start at the same element.
		if len(cap_) > 0 && &t.cap[0] != &cap_[0] {
			copy(t.cap, cap_)
		}
		d.t = t
		t = nil
	}
	return t
}

// onePassMachine holds the per-search state used by doOnePass: the
// cached inputs and the capture positions of the match.
type onePassMachine struct {
	inputs   inputs
	matchcap []int
}

// XXX sync not yet supported
// var onePassPool sync.Pool

// newOnePassMachine returns a new, zero-valued onePassMachine.
// The pool-based reuse is commented out, so every call allocates.
func newOnePassMachine() *onePassMachine {
	// m, ok := onePassPool.Get().(*onePassMachine)
	// if !ok {
	m := new(onePassMachine)
	//}
	return m
}

// freeOnePassMachine clears m's cached input reference.
// The machine is not returned to any pool (that call is commented out).
func freeOnePassMachine(m *onePassMachine) {
	m.inputs.clear()
	// onePassPool.Put(m)
}

// doOnePass implements r.doExecute using the one-pass execution engine.
func (re *Regexp) doOnePass(ir io.RuneReader, ib []byte, is string, pos, ncap int, dstCap []int) []int {
	startCond := re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return nil
	}

	// Size the capture slice for ncap entries, reusing its storage when
	// the capacity allows.
	m := newOnePassMachine()
	if cap(m.matchcap) < ncap {
		m.matchcap = make([]int, ncap)
	} else {
		m.matchcap = m.matchcap[:ncap]
	}

	matched := false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}

	i, _ := m.inputs.init(ir, ib, is)

	// r is the current rune and r1 the lookahead rune after it; width and
	// width1 are the widths i.step reported for them.
	r, r1 := endOfText, endOfText
	width, width1 := 0, 0
	r, width = i.step(pos)
	if r != endOfText {
		r1, width1 = i.step(pos + width)
	}
	var flag lazyFlag
	if pos == 0 {
		// At position 0 there is no preceding rune; -1 stands in for it.
		flag = newLazyFlag(-1, r)
	} else {
		flag = i.context(pos)
	}
	pc := re.onepass.Start
	inst := re.onepass.Inst[pc]
	// If there is a simple literal prefix, skip over it.
	if pos == 0 && flag.match(syntax.EmptyOp(inst.Arg)) &&
		len(re.prefix) > 0 && i.canCheckPrefix() {
		// Match requires literal prefix; fast search for it.
		if !i.hasPrefix(re) {
			goto Return
		}
		// Resume after the prefix: reload both runes and the boundary
		// flags, and continue from the instruction at re.prefixEnd.
		pos += len(re.prefix)
		r, width = i.step(pos)
		r1, width1 = i.step(pos + width)
		flag = i.context(pos)
		pc = int(re.prefixEnd)
	}
	for {
		inst = re.onepass.Inst[pc]
		// Default successor; the Alt cases below override it.
		pc = int(inst.Out)
		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstMatch:
			matched = true
			if len(m.matchcap) > 0 {
				// The match start is always recorded as 0 here.
				m.matchcap[0] = 0
				m.matchcap[1] = pos
			}
			goto Return
		case syntax.InstRune:
			if !inst.MatchRune(r) {
				goto Return
			}
		case syntax.InstRune1:
			if r != inst.Rune[0] {
				goto Return
			}
		case syntax.InstRuneAny:
			// Nothing
		case syntax.InstRuneAnyNotNL:
			if r == '\n' {
				goto Return
			}
		// peek at the input rune to see which branch of the Alt to take
		case syntax.InstAlt, syntax.InstAltMatch:
			pc = int(onePassNext(&inst, r))
			continue
		case syntax.InstFail:
			goto Return
		case syntax.InstNop:
			continue
		case syntax.InstEmptyWidth:
			if !flag.match(syntax.EmptyOp(inst.Arg)) {
				goto Return
			}
			continue
		case syntax.InstCapture:
			if int(inst.Arg) < len(m.matchcap) {
				m.matchcap[inst.Arg] = pos
			}
			continue
		}
		// Only the rune-consuming cases reach this point; all others
		// either continue the loop or jump to Return.
		if width == 0 {
			// NOTE(review): presumably width 0 means there is no more
			// input to consume — confirm against the input types.
			break
		}
		// Advance one rune: the lookahead becomes the current rune.
		flag = newLazyFlag(r, r1)
		pos += width
		r, width = r1, width1
		if r != endOfText {
			r1, width1 = i.step(pos + width)
		}
	}

Return:
	if !matched {
		freeOnePassMachine(m)
		return nil
	}

	// Copy the captures out before the machine is released.
	dstCap = append(dstCap, m.matchcap...)
	freeOnePassMachine(m)
	return dstCap
}

// doMatch reports whether either r, b or s match the regexp.
func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool {
	// No captures are requested (ncap == 0); only nil versus non-nil matters.
	return re.doExecute(r, b, s, 0, 0, nil) != nil
}

// doExecute finds the leftmost match in the input, appends the position
// of its subexpressions to dstCap and returns dstCap.
519 // 520 // nil is returned if no matches are found and non-nil if matches are found. 521 func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int { 522 if dstCap == nil { 523 // Make sure 'return dstCap' is non-nil. 524 dstCap = arrayNoInts[:0:0] 525 } 526 527 if r == nil && len(b)+len(s) < re.minInputLen { 528 return nil 529 } 530 531 if re.onepass != nil { 532 return re.doOnePass(r, b, s, pos, ncap, dstCap) 533 } 534 if r == nil && len(b)+len(s) < re.maxBitStateLen { 535 return re.backtrack(b, s, pos, ncap, dstCap) 536 } 537 538 m := re.get() 539 i, _ := m.inputs.init(r, b, s) 540 541 m.init(ncap) 542 if !m.match(i, pos) { 543 re.put(m) 544 return nil 545 } 546 547 dstCap = append(dstCap, m.matchcap...) 548 re.put(m) 549 return dstCap 550 } 551 552 // arrayNoInts is returned by doExecute match if nil dstCap is passed 553 // to it with ncap=0. 554 var arrayNoInts [0]int