9fans.net/go@v0.0.7/cmd/sam/regexp.go (about) 1 package main 2 3 import "unicode/utf8" 4 5 var sel Rangeset 6 var lastregexp String 7 8 /* 9 * Machine Information 10 */ 11 12 type Inst struct { 13 type_ int 14 15 // former union 16 subid int 17 rclass int 18 right *Inst 19 20 // former union 21 next *Inst 22 } 23 24 const NPROG = 1024 25 26 var program [NPROG]Inst 27 var progp int 28 var startinst *Inst /* First inst. of program; might not be program[0] */ 29 var bstartinst *Inst /* same for backwards machine */ 30 31 type Ilist struct { 32 inst *Inst 33 se Rangeset 34 startp Posn 35 } 36 37 var tl []Ilist /* This list, next list */ 38 39 const NLIST = 127 40 41 var nl []Ilist 42 var list [2][NLIST + 1]Ilist /* +1 for trailing null */ 43 var sempty Rangeset 44 45 /* 46 * Actions and Tokens 47 * 48 * 0x10000xx are operators, value == precedence 49 * 0x20000xx are tokens, i.e. operands for operators 50 */ 51 const ( 52 OPERATOR = 0x1000000 /* Bit set in all operators */ 53 START = OPERATOR + 0 /* Start, used for marker on stack */ 54 RBRA = OPERATOR + 1 /* Right bracket, */ 55 LBRA = OPERATOR + 2 /* Left bracket, */ 56 OR = OPERATOR + 3 /* Alternation, | */ 57 CAT = OPERATOR + 4 /* Concatentation, implicit operator */ 58 STAR = OPERATOR + 5 /* Closure, * */ 59 PLUS = OPERATOR + 6 /* a+ == aa* */ 60 QUEST = OPERATOR + 7 /* a? == a|nothing, i.e. 0 or 1 a's */ 61 ANY = 0x2000000 /* Any character but newline, . */ 62 NOP = ANY + 1 /* No operation, internal use only */ 63 BOL = ANY + 2 /* Beginning of line, ^ */ 64 EOL = ANY + 3 /* End of line, $ */ 65 CCLASS = ANY + 4 /* Character class, [] */ 66 NCCLASS = ANY + 5 /* Negated character class, [^] */ 67 END = ANY + 0x77 /* Terminate: match found */ 68 69 ISATOR = OPERATOR 70 ISAND = ANY 71 72 QUOTED = 0x4000000 /* Bit set for \-ed lex characters */ 73 ) 74 75 /* 76 * Parser Information 77 */ 78 79 type Node struct { 80 first *Inst 81 last *Inst 82 } 83 84 const NSTACK = 20 85 86 var andstack [NSTACK]Node 87 var andp int 88 var atorstack [NSTACK]int 89 var atorp int 90 var lastwasand bool /* Last token was operand */ 91 var cursubid int 92 var subidstack [NSTACK]int 93 var subidp int 94 var backwards bool 95 var nbra int 96 var exprp []rune /* pointer to next character in source expression */ 97 const DCLASS = 10 /* allocation increment */ 98 var class [][]rune 99 var negateclass bool 100 101 func regerror(e Err) { 102 Strzero(&lastregexp) 103 error_(e) 104 } 105 106 func regerror_c(e Err, c rune) { 107 Strzero(&lastregexp) 108 error_c(e, c) 109 } 110 111 func newinst(t int) *Inst { 112 if progp >= NPROG { 113 regerror(Etoolong) 114 } 115 p := &program[progp] 116 progp++ 117 *p = Inst{} 118 p.type_ = t 119 return p 120 } 121 122 func realcompile(s []rune) *Inst { 123 startlex(s) 124 atorp = 0 125 andp = 0 126 subidp = 0 127 cursubid = 0 128 lastwasand = false 129 /* Start with a low priority operator to prime parser */ 130 pushator(START - 1) 131 for { 132 token := lex() 133 if token == END { 134 break 135 } 136 if token&ISATOR == OPERATOR { 137 operator(int(token)) 138 } else { 139 operand(int(token)) 140 } 141 } 142 /* Close with a low priority operator */ 143 evaluntil(START) 144 /* Force END */ 145 operand(END) 146 evaluntil(START) 147 if nbra != 0 { 148 regerror(Eleftpar) 149 } 150 andp-- /* points to first and only operand */ 151 return andstack[andp].first 152 } 153 154 func compile(s *String) { 155 if Strcmp(s, &lastregexp) == 0 { 156 return 157 } 158 for _, c := range class { 159 // free(c) 160 _ = c 161 } 162 class = class[:0] 163 progp = 0 164 backwards = false 165 startinst = realcompile(s.s) 166 optimize(0) 167 oprogp := progp 168 backwards = true 169 bstartinst = realcompile(s.s) 170 optimize(oprogp) 171 Strduplstr(&lastregexp, s) 172 } 173 174 func operand(t int) { 175 if lastwasand { 176 operator(CAT) /* catenate is implicit */ 177 } 178 i := newinst(t) 179 if t == CCLASS { 180 if negateclass { 181 i.type_ = NCCLASS /* UGH */ 182 } 183 i.rclass = len(class) - 1 /* UGH */ 184 } 185 pushand(i, i) 186 lastwasand = true 187 } 188 189 func operator(t int) { 190 if t == RBRA { 191 nbra-- 192 if nbra < 0 { 193 regerror(Erightpar) 194 } 195 } 196 if t == LBRA { 197 /* 198 * if(++cursubid >= NSUBEXP) 199 * regerror(Esubexp); 200 */ 201 cursubid++ /* silently ignored */ 202 nbra++ 203 if lastwasand { 204 operator(CAT) 205 } 206 } else { 207 evaluntil(t) 208 } 209 if t != RBRA { 210 pushator(t) 211 } 212 lastwasand = false 213 if t == STAR || t == QUEST || t == PLUS || t == RBRA { 214 lastwasand = true /* these look like operands */ 215 } 216 } 217 218 func cant(s string) { 219 panic_("regexp: can't happen: " + s) 220 } 221 222 func pushand(f *Inst, l *Inst) { 223 if andp >= len(andstack) { 224 cant("operand stack overflow") 225 } 226 a := &andstack[andp] 227 andp++ 228 a.first = f 229 a.last = l 230 } 231 232 func pushator(t int) { 233 if atorp >= NSTACK { 234 cant("operator stack overflow") 235 } 236 atorstack[atorp] = t 237 atorp++ 238 if cursubid >= NSUBEXP { 239 subidstack[subidp] = -1 240 subidp++ 241 } else { 242 subidstack[subidp] = cursubid 243 subidp++ 244 } 245 } 246 247 func popand(op rune) *Node { 248 if andp <= 0 { 249 if op != 0 { 250 regerror_c(Emissop, op) 251 } else { 252 regerror(Ebadregexp) 253 } 254 } 255 andp-- 256 return &andstack[andp] 257 } 258 259 func popator() int { 260 if atorp <= 0 { 261 cant("operator stack underflow") 262 } 263 subidp-- 264 atorp-- 265 return atorstack[atorp] 266 } 267 268 func evaluntil(pri int) { 269 for pri == RBRA || atorstack[atorp-1] >= pri { 270 var inst2 *Inst 271 var inst1 *Inst 272 var t *Node 273 var op2 *Node 274 var op1 *Node 275 switch popator() { 276 case LBRA: 277 op1 = popand('(') 278 inst2 = newinst(RBRA) 279 inst2.subid = subidstack[subidp] 280 op1.last.next = inst2 281 inst1 = newinst(LBRA) 282 inst1.subid = subidstack[subidp] 283 inst1.next = op1.first 284 pushand(inst1, inst2) 285 return /* must have been RBRA */ 286 default: 287 panic_("unknown regexp operator") 288 case OR: 289 op2 = popand('|') 290 op1 = popand('|') 291 inst2 = newinst(NOP) 292 op2.last.next = inst2 293 op1.last.next = inst2 294 inst1 = newinst(OR) 295 inst1.right = op1.first 296 inst1.next = op2.first 297 pushand(inst1, inst2) 298 case CAT: 299 op2 = popand(0) 300 op1 = popand(0) 301 if backwards && op2.first.type_ != END { 302 t = op1 303 op1 = op2 304 op2 = t 305 } 306 op1.last.next = op2.first 307 pushand(op1.first, op2.last) 308 case STAR: 309 op2 = popand('*') 310 inst1 = newinst(OR) 311 op2.last.next = inst1 312 inst1.right = op2.first 313 pushand(inst1, inst1) 314 case PLUS: 315 op2 = popand('+') 316 inst1 = newinst(OR) 317 op2.last.next = inst1 318 inst1.right = op2.first 319 pushand(op2.first, inst1) 320 case QUEST: 321 op2 = popand('?') 322 inst1 = newinst(OR) 323 inst2 = newinst(NOP) 324 inst1.next = inst2 325 inst1.right = op2.first 326 op2.last.next = inst2 327 pushand(inst1, inst2) 328 } 329 } 330 } 331 332 func optimize(start int) { 333 for i := start; program[i].type_ != END; i++ { 334 inst := &program[i] 335 target := inst.next 336 for target.type_ == NOP { 337 target = target.next 338 } 339 inst.next = target 340 } 341 } 342 343 // #ifdef DEBUG 344 func dumpstack() { 345 dprint("operators\n") 346 for ip := 0; ip < atorp; ip++ { 347 dprint("0%o\n", atorstack[ip]) 348 } 349 dprint("operands\n") 350 for stk := 0; stk < andp; stk++ { 351 dprint("0%o\t0%o\n", andstack[stk].first.type_, andstack[stk].last.type_) 352 } 353 } 354 355 func dump() { 356 l := 0 357 for { 358 p := &program[l] 359 dprint("%p:\t0%o\t%p\t%p\n", p, p.type_, p.next, p.right) 360 if p.type_ == 0 { 361 break 362 } 363 } 364 } 365 366 // #endif 367 368 func startlex(s []rune) { 369 exprp = s 370 nbra = 0 371 } 372 373 func lex() rune { 374 if len(exprp) == 0 { 375 return END 376 } 377 378 c := exprp[0] 379 exprp = exprp[1:] 380 switch c { 381 case '\\': 382 if len(exprp) > 0 { 383 c = exprp[0] 384 exprp = exprp[1:] 385 if c == 'n' { 386 c = '\n' 387 } 388 } 389 case '*': 390 c = STAR 391 case '?': 392 c = QUEST 393 case '+': 394 c = PLUS 395 case '|': 396 c = OR 397 case '.': 398 c = ANY 399 case '(': 400 c = LBRA 401 case ')': 402 c = RBRA 403 case '^': 404 c = BOL 405 case '$': 406 c = EOL 407 case '[': 408 c = CCLASS 409 bldcclass() 410 } 411 return c 412 } 413 414 func nextrec() rune { 415 if len(exprp) == 0 || (len(exprp) == 1 && exprp[0] == '\\') { 416 regerror(Ebadclass) 417 } 418 if exprp[0] == '\\' { 419 exprp = exprp[1:] 420 if exprp[0] == 'n' { 421 exprp = exprp[1:] 422 return '\n' 423 } 424 c := exprp[0] 425 exprp = exprp[1:] 426 return c | QUOTED 427 } 428 c := exprp[0] 429 exprp = exprp[1:] 430 return c 431 } 432 433 func bldcclass() { 434 var classp []rune 435 /* we have already seen the '[' */ 436 if exprp[0] == '^' { /* don't match newline in negate case */ 437 classp = append(classp, '\n') 438 negateclass = true 439 exprp = exprp[1:] 440 } else { 441 negateclass = false 442 } 443 for { 444 c1 := nextrec() 445 if c1 == ']' { 446 break 447 } 448 if c1 == '-' { 449 goto Error 450 } 451 if exprp[0] == '-' { 452 exprp = exprp[1:] /* eat '-' */ 453 c2 := nextrec() 454 if c2 == ']' { 455 goto Error 456 } 457 classp = append(classp, utf8.MaxRune, c1, c2) 458 } else { 459 classp = append(classp, c1&^QUOTED) 460 } 461 } 462 class = append(class, classp) 463 return 464 465 Error: 466 // free(classp) 467 regerror(Ebadclass) 468 } 469 470 func classmatch(classno int, c rune, negate bool) bool { 471 p := class[classno] 472 for len(p) > 0 { 473 if p[0] == utf8.MaxRune { 474 if p[1] <= c && c <= p[2] { 475 return !negate 476 } 477 p = p[3:] 478 } else { 479 r := p[0] 480 p = p[1:] 481 if r == c { 482 return !negate 483 } 484 } 485 } 486 return negate 487 } 488 489 /* 490 * Note optimization in addinst: 491 * *l must be pending when addinst called; if *l has been looked 492 * at already, the optimization is a bug. 493 */ 494 func addinst(l []Ilist, inst *Inst, sep *Rangeset) int { 495 i := 0 496 p := &l[i] 497 for p.inst != nil { 498 if p.inst == inst { 499 if sep.p[0].p1 < p.se.p[0].p1 { 500 p.se = *sep /* this would be bug */ 501 } 502 return 0 /* It's already there */ 503 } 504 i++ 505 p = &l[i] 506 } 507 p.inst = inst 508 p.se = *sep 509 l[i+1].inst = nil 510 return 1 511 } 512 513 func execute(f *File, startp Posn, eof Posn) bool { 514 flag := 0 515 p := startp 516 nnl := 0 517 wrapped := 0 518 startchar := rune(0) 519 if startinst.type_ < OPERATOR { 520 startchar = rune(startinst.type_) 521 } 522 523 list[1][0].inst = nil 524 list[0][0].inst = list[1][0].inst 525 sel.p[0].p1 = -1 526 /* Execute machine once for each character */ 527 for ; ; p++ { 528 doloop: 529 c := filereadc(f, p) 530 if p >= eof || c < 0 { 531 tmp21 := wrapped 532 wrapped++ 533 switch tmp21 { 534 case 0, /* let loop run one more click */ 535 2: 536 break 537 case 1: /* expired; wrap to beginning */ 538 if sel.p[0].p1 >= 0 || eof != INFINITY { 539 goto Return 540 } 541 list[1][0].inst = nil 542 list[0][0].inst = list[1][0].inst 543 p = 0 544 goto doloop 545 default: 546 goto Return 547 } 548 } else if ((wrapped != 0 && p >= startp) || sel.p[0].p1 > 0) && nnl == 0 { 549 break 550 } 551 /* fast check for first char */ 552 if startchar != 0 && nnl == 0 && c != startchar { 553 continue 554 } 555 tl = list[flag][:] 556 flag ^= 1 557 nl = list[flag][:] 558 nl[0].inst = nil 559 ntl := nnl 560 nnl = 0 561 if sel.p[0].p1 < 0 && (wrapped == 0 || p < startp || startp == eof) { 562 /* Add first instruction to this list */ 563 sempty.p[0].p1 = p 564 if addinst(tl, startinst, &sempty) != 0 { 565 ntl++ 566 if ntl >= NLIST { 567 goto Overflow 568 } 569 } 570 } 571 /* Execute machine until this list is empty */ 572 for tlp := 0; ; tlp++ { 573 inst := tl[tlp].inst 574 prev := inst 575 if inst == nil { 576 break 577 } 578 Switchstmt: 579 if inst == nil { 580 debug("%#x led to nil", prev.type_) 581 } 582 prev = inst 583 switch inst.type_ { 584 default: /* regular character */ 585 if inst.type_ == int(c) { 586 goto Addinst 587 } 588 case LBRA: 589 if inst.subid >= 0 { 590 tl[tlp].se.p[inst.subid].p1 = p 591 } 592 inst = inst.next 593 goto Switchstmt 594 case RBRA: 595 if inst.subid >= 0 { 596 tl[tlp].se.p[inst.subid].p2 = p 597 } 598 inst = inst.next 599 goto Switchstmt 600 case ANY: 601 if c != '\n' { 602 goto Addinst 603 } 604 case BOL: 605 if p == 0 || filereadc(f, p-1) == '\n' { 606 inst = inst.next 607 goto Switchstmt 608 } 609 case EOL: 610 if c == '\n' { 611 inst = inst.next 612 goto Switchstmt 613 } 614 case CCLASS: 615 if c >= 0 && classmatch(inst.rclass, c, false) { 616 goto Addinst 617 } 618 case NCCLASS: 619 if c >= 0 && classmatch(inst.rclass, c, true) { 620 goto Addinst 621 } 622 /* evaluate right choice later */ 623 case OR: 624 if inst.next == nil { 625 debug("OR no left") 626 } 627 if inst.right == nil { 628 debug("OR no right") 629 } 630 if addinst(tl, inst.right, &tl[tlp].se) != 0 { 631 ntl++ 632 if ntl >= NLIST { 633 goto Overflow 634 } 635 } 636 /* efficiency: advance and re-evaluate */ 637 inst = inst.next 638 goto Switchstmt 639 case END: /* Match! */ 640 tl[tlp].se.p[0].p2 = p 641 newmatch(&tl[tlp].se) 642 } 643 continue 644 645 Addinst: 646 if addinst(nl, inst.next, &tl[tlp].se) != 0 { 647 nnl++ 648 if nnl >= NLIST { 649 goto Overflow 650 } 651 } 652 } 653 } 654 Return: 655 return sel.p[0].p1 >= 0 656 657 Overflow: 658 error_(Eoverflow) 659 panic("unreachable") 660 } 661 662 func newmatch(sp *Rangeset) { 663 if sel.p[0].p1 < 0 || sp.p[0].p1 < sel.p[0].p1 || (sp.p[0].p1 == sel.p[0].p1 && sp.p[0].p2 > sel.p[0].p2) { 664 for i := 0; i < NSUBEXP; i++ { 665 sel.p[i] = sp.p[i] 666 } 667 } 668 } 669 670 func bexecute(f *File, startp Posn) bool { 671 flag := 0 672 p := startp 673 nnl := 0 674 wrapped := 0 675 startchar := rune(0) 676 if bstartinst.type_ < OPERATOR { 677 startchar = rune(bstartinst.type_) 678 } 679 680 list[1][0].inst = nil 681 list[0][0].inst = list[1][0].inst 682 sel.p[0].p1 = -1 683 /* Execute machine once for each character, including terminal NUL */ 684 for ; ; p-- { 685 doloop: 686 c := filereadc(f, p-1) 687 if c == -1 { 688 tmp23 := wrapped 689 wrapped++ 690 switch tmp23 { 691 case 0, /* let loop run one more click */ 692 2: 693 break 694 case 1: /* expired; wrap to end */ 695 if sel.p[0].p1 >= 0 { 696 goto Return 697 } 698 list[1][0].inst = nil 699 list[0][0].inst = list[1][0].inst 700 p = f.b.nc 701 goto doloop 702 case 3: 703 fallthrough 704 default: 705 goto Return 706 } 707 } else if ((wrapped != 0 && p <= startp) || sel.p[0].p1 > 0) && nnl == 0 { 708 break 709 } 710 /* fast check for first char */ 711 if startchar != 0 && nnl == 0 && c != startchar { 712 continue 713 } 714 tl = list[flag][:] 715 flag ^= 1 716 nl = list[flag][:] 717 nl[0].inst = nil 718 ntl := nnl 719 nnl = 0 720 if sel.p[0].p1 < 0 && (wrapped == 0 || p > startp) { 721 /* Add first instruction to this list */ 722 /* the minus is so the optimizations in addinst work */ 723 sempty.p[0].p1 = -p 724 if addinst(tl, bstartinst, &sempty) != 0 { 725 ntl++ 726 if ntl >= NLIST { 727 goto Overflow 728 } 729 } 730 } 731 /* Execute machine until this list is empty */ 732 for tlp := 0; ; tlp++ { 733 inst := tl[tlp].inst 734 if inst == nil { 735 break 736 } 737 Switchstmt: 738 switch inst.type_ { 739 default: /* regular character */ 740 if inst.type_ == int(c) { 741 goto Addinst 742 } 743 case LBRA: 744 if inst.subid >= 0 { 745 tl[tlp].se.p[inst.subid].p1 = p 746 } 747 inst = inst.next 748 goto Switchstmt 749 case RBRA: 750 if inst.subid >= 0 { 751 tl[tlp].se.p[inst.subid].p2 = p 752 } 753 inst = inst.next 754 goto Switchstmt 755 case ANY: 756 if c != '\n' { 757 goto Addinst 758 } 759 case BOL: 760 if c == '\n' || p == 0 { 761 inst = inst.next 762 goto Switchstmt 763 } 764 case EOL: 765 if p == f.b.nc || filereadc(f, p) == '\n' { 766 inst = inst.next 767 goto Switchstmt 768 } 769 case CCLASS: 770 if c >= 0 && classmatch(inst.rclass, c, false) { 771 goto Addinst 772 } 773 case NCCLASS: 774 if c >= 0 && classmatch(inst.rclass, c, true) { 775 goto Addinst 776 } 777 /* evaluate right choice later */ 778 case OR: 779 if addinst(tl[tlp:], inst.right, &tl[tlp].se) != 0 { 780 ntl++ 781 if ntl >= NLIST { 782 goto Overflow 783 } 784 } 785 /* efficiency: advance and re-evaluate */ 786 inst = inst.next 787 goto Switchstmt 788 case END: /* Match! */ 789 tl[tlp].se.p[0].p1 = -tl[tlp].se.p[0].p1 /* minus sign */ 790 tl[tlp].se.p[0].p2 = p 791 bnewmatch(&tl[tlp].se) 792 } 793 continue 794 795 Addinst: 796 if addinst(nl, inst.next, &tl[tlp].se) != 0 { 797 nnl++ 798 if nnl >= NLIST { 799 goto Overflow 800 } 801 } 802 } 803 } 804 Return: 805 return sel.p[0].p1 >= 0 806 807 Overflow: 808 error_(Eoverflow) 809 panic("unreachable") 810 811 } 812 813 func bnewmatch(sp *Rangeset) { 814 if sel.p[0].p1 < 0 || sp.p[0].p1 > sel.p[0].p2 || (sp.p[0].p1 == sel.p[0].p2 && sp.p[0].p2 < sel.p[0].p1) { 815 for i := 0; i < NSUBEXP; i++ { /* note the reversal; p1<=p2 */ 816 sel.p[i].p1 = sp.p[i].p2 817 sel.p[i].p2 = sp.p[i].p1 818 } 819 } 820 }