github.com/couchbaselabs/nex@v0.0.0-20230419191105-421cb5932838/nex.go (about) 1 // Substantial copy-and-paste from src/pkg/regexp. 2 package main 3 4 import ( 5 "bufio" 6 "errors" 7 "fmt" 8 "io" 9 "io/ioutil" 10 "log" 11 "os" 12 "sort" 13 "strconv" 14 "strings" 15 ) 16 import ( 17 "go/format" 18 "go/parser" 19 "go/printer" 20 "go/token" 21 ) 22 23 type rule struct { 24 regex []rune 25 code string 26 startCode string 27 endCode string 28 kid []*rule 29 id string 30 } 31 32 var ( 33 ErrInternal = errors.New("internal error") 34 ErrUnmatchedLpar = errors.New("unmatched '('") 35 ErrUnmatchedRpar = errors.New("unmatched ')'") 36 ErrUnmatchedLbkt = errors.New("unmatched '['") 37 ErrUnmatchedRbkt = errors.New("unmatched ']'") 38 ErrBadRange = errors.New("bad range in character class") 39 ErrExtraneousBackslash = errors.New("extraneous backslash") 40 ErrBareClosure = errors.New("closure applies to nothing") 41 ErrBadBackslash = errors.New("illegal backslash escape") 42 ErrExpectedLBrace = errors.New("expected '{'") 43 ErrUnmatchedLBrace = errors.New("unmatched '{'") 44 ErrUnexpectedEOF = errors.New("unexpected EOF") 45 ErrUnexpectedNewline = errors.New("unexpected newline") 46 ErrUnexpectedLAngle = errors.New("unexpected '<'") 47 ErrUnmatchedLAngle = errors.New("unmatched '<'") 48 ErrUnmatchedRAngle = errors.New("unmatched '>'") 49 ) 50 51 func ispunct(c rune) bool { 52 for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" { 53 if c == r { 54 return true 55 } 56 } 57 return false 58 } 59 60 var escapes = []rune("abfnrtv") 61 var escaped = []rune("\a\b\f\n\r\t\v") 62 63 func escape(c rune) rune { 64 for i, b := range escapes { 65 if b == c { 66 return escaped[i] 67 } 68 } 69 return -1 70 } 71 72 const ( 73 kNil = iota 74 kRune 75 kClass 76 kWild 77 kStart 78 kEnd 79 ) 80 81 type edge struct { 82 kind int // Rune/Class/Wild/Nil. 83 r rune // Rune for rune edges. 84 lim []rune // Pairs of limits for character class edges. 85 negate bool // True if the character class is negated. 86 dst *node // Destination node. 87 } 88 type node struct { 89 e edges // Outedges. 90 n int // Index number. Scoped to a family. 91 accept bool // True if this is an accepting state. 92 set []int // The NFA nodes represented by a DFA node. 93 } 94 95 type edges []*edge 96 97 func (e edges) Len() int { 98 return len(e) 99 } 100 func (e edges) Less(i, j int) bool { 101 return e[i].r < e[j].r 102 } 103 104 func (e edges) Swap(i, j int) { 105 e[i], e[j] = e[j], e[i] 106 } 107 108 type RuneSlice []rune 109 110 func (p RuneSlice) Len() int { return len(p) } 111 func (p RuneSlice) Less(i, j int) bool { return p[i] < p[j] } 112 func (p RuneSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 113 114 // Print a graph in DOT format given the start node. 115 // 116 // $ dot -Tps input.dot -o output.ps 117 func writeDotGraph(outf *os.File, start *node, id string) { 118 done := make(map[*node]bool) 119 var show func(*node) 120 show = func(u *node) { 121 if u.accept { 122 fmt.Fprintf(outf, " %v[style=filled,color=green];\n", u.n) 123 } 124 done[u] = true 125 for _, e := range u.e { 126 // We use -1 to denote the dead end node in DFAs. 127 if e.dst.n == -1 { 128 continue 129 } 130 label := "" 131 runeToDot := func(r rune) string { 132 if strconv.IsPrint(r) { 133 return fmt.Sprintf("%v", string(r)) 134 } 135 return fmt.Sprintf("U+%X", int(r)) 136 } 137 switch e.kind { 138 case kRune: 139 label = fmt.Sprintf("[label=%q]", runeToDot(e.r)) 140 case kWild: 141 label = "[color=blue]" 142 case kClass: 143 label = "[label=\"[" 144 if e.negate { 145 label += "^" 146 } 147 for i := 0; i < len(e.lim); i += 2 { 148 label += runeToDot(e.lim[i]) 149 if e.lim[i] != e.lim[i+1] { 150 label += "-" + runeToDot(e.lim[i+1]) 151 } 152 } 153 label += "]\"]" 154 } 155 fmt.Fprintf(outf, " %v -> %v%v;\n", u.n, e.dst.n, label) 156 } 157 for _, e := range u.e { 158 if !done[e.dst] { 159 show(e.dst) 160 } 161 } 162 } 163 fmt.Fprintf(outf, "digraph %v {\n 0[shape=box];\n", id) 164 show(start) 165 fmt.Fprintln(outf, "}") 166 } 167 168 func inClass(r rune, lim []rune) bool { 169 for i := 0; i < len(lim); i += 2 { 170 if lim[i] <= r && r <= lim[i+1] { 171 return true 172 } 173 } 174 return false 175 } 176 177 var dfadot, nfadot *os.File 178 179 func gen(out *bufio.Writer, x *rule) { 180 s := x.regex 181 // Regex -> NFA 182 // We cannot have our alphabet be all Unicode characters. Instead, 183 // we compute an alphabet for each regex: 184 // 185 // 1. Singles: we add single runes used in the regex: any rune not in a 186 // range. These are held in `sing`. 187 // 188 // 2. Ranges: entire ranges become elements of the alphabet. If ranges in 189 // the same expression overlap, we break them up into non-overlapping 190 // ranges. The generated code checks singles before ranges, so there's no 191 // need to break up a range if it contains a single. These are maintained 192 // in sorted order in `lim`. 193 // 194 // 3. Wild: we add an element representing all other runes. 195 // 196 // e.g. the alphabet of /[0-9]*[Ee][2-5]*/ is sing: { E, e }, 197 // lim: { [0-1], [2-5], [6-9] } and the wild element. 198 sing := make(map[rune]bool) 199 var lim []rune 200 var insertLimits func(l, r rune) 201 // Insert a new range [l-r] into `lim`, breaking it up if it overlaps, and 202 // discarding it if it coincides with an existing range. We keep `lim` 203 // sorted. 204 insertLimits = func(l, r rune) { 205 var i int 206 for i = 0; i < len(lim); i += 2 { 207 if l <= lim[i+1] { 208 break 209 } 210 } 211 if len(lim) == i || r < lim[i] { 212 lim = append(lim, 0, 0) 213 copy(lim[i+2:], lim[i:]) 214 lim[i] = l 215 lim[i+1] = r 216 return 217 } 218 if l < lim[i] { 219 lim = append(lim, 0, 0) 220 copy(lim[i+2:], lim[i:]) 221 lim[i+1] = lim[i] - 1 222 lim[i] = l 223 insertLimits(lim[i], r) 224 return 225 } 226 if l > lim[i] { 227 lim = append(lim, 0, 0) 228 copy(lim[i+2:], lim[i:]) 229 lim[i+1] = l - 1 230 lim[i+2] = l 231 insertLimits(l, r) 232 return 233 } 234 // l == lim[i] 235 if r == lim[i+1] { 236 return 237 } 238 if r < lim[i+1] { 239 lim = append(lim, 0, 0) 240 copy(lim[i+2:], lim[i:]) 241 lim[i] = l 242 lim[i+1] = r 243 lim[i+2] = r + 1 244 return 245 } 246 insertLimits(lim[i+1]+1, r) 247 } 248 pos := 0 249 n := 0 250 newNode := func() *node { 251 res := new(node) 252 res.n = n 253 n++ 254 return res 255 } 256 newEdge := func(u, v *node) *edge { 257 res := new(edge) 258 res.dst = v 259 u.e = append(u.e, res) 260 sort.Sort(u.e) 261 return res 262 } 263 newStartEdge := func(u, v *node) *edge { 264 res := newEdge(u, v) 265 res.kind = kStart 266 return res 267 } 268 newEndEdge := func(u, v *node) *edge { 269 res := newEdge(u, v) 270 res.kind = kEnd 271 return res 272 } 273 newWildEdge := func(u, v *node) *edge { 274 res := newEdge(u, v) 275 res.kind = kWild 276 return res 277 } 278 newRuneEdge := func(u, v *node, r rune) *edge { 279 res := newEdge(u, v) 280 res.kind = kRune 281 res.r = r 282 sing[r] = true 283 return res 284 } 285 newNilEdge := func(u, v *node) *edge { 286 res := newEdge(u, v) 287 res.kind = kNil 288 return res 289 } 290 newClassEdge := func(u, v *node) *edge { 291 res := newEdge(u, v) 292 res.kind = kClass 293 res.lim = make([]rune, 0, 2) 294 return res 295 } 296 maybeEscape := func() rune { 297 c := s[pos] 298 if '\\' == c { 299 pos++ 300 if len(s) == pos { 301 panic(ErrExtraneousBackslash) 302 } 303 c = s[pos] 304 switch { 305 case ispunct(c): 306 case escape(c) >= 0: 307 c = escape(s[pos]) 308 default: 309 panic(ErrBadBackslash) 310 } 311 } 312 return c 313 } 314 pcharclass := func() (start, end *node) { 315 start, end = newNode(), newNode() 316 e := newClassEdge(start, end) 317 // Ranges consisting of a single element are a special case: 318 singletonRange := func(c rune) { 319 // 1. The edge-specific 'lim' field always expects endpoints in pairs, 320 // so we must give 'c' as the beginning and the end of the range. 321 e.lim = append(e.lim, c, c) 322 // 2. Instead of updating the regex-wide 'lim' interval set, we add a singleton. 323 sing[c] = true 324 } 325 if len(s) > pos && '^' == s[pos] { 326 e.negate = true 327 pos++ 328 } 329 var left rune 330 leftLive := false 331 justSawDash := false 332 first := true 333 // Allow '-' at the beginning and end, and in ranges. 334 for pos < len(s) && s[pos] != ']' { 335 switch c := maybeEscape(); c { 336 case '-': 337 if first { 338 singletonRange('-') 339 break 340 } 341 justSawDash = true 342 default: 343 if justSawDash { 344 if !leftLive || left > c { 345 panic(ErrBadRange) 346 } 347 e.lim = append(e.lim, left, c) 348 if left == c { 349 sing[c] = true 350 } else { 351 insertLimits(left, c) 352 } 353 leftLive = false 354 } else { 355 if leftLive { 356 singletonRange(left) 357 } 358 left = c 359 leftLive = true 360 } 361 justSawDash = false 362 } 363 first = false 364 pos++ 365 } 366 if leftLive { 367 singletonRange(left) 368 } 369 if justSawDash { 370 singletonRange('-') 371 } 372 return 373 } 374 isNested := false 375 var pre func() (start, end *node) 376 pterm := func() (start, end *node) { 377 if len(s) == pos || s[pos] == '|' { 378 end = newNode() 379 start = end 380 return 381 } 382 switch s[pos] { 383 case '*', '+', '?': 384 panic(ErrBareClosure) 385 case ')': 386 if !isNested { 387 panic(ErrUnmatchedRpar) 388 } 389 end = newNode() 390 start = end 391 return 392 case '(': 393 pos++ 394 oldIsNested := isNested 395 isNested = true 396 start, end = pre() 397 isNested = oldIsNested 398 if len(s) == pos || ')' != s[pos] { 399 panic(ErrUnmatchedLpar) 400 } 401 case '.': 402 start, end = newNode(), newNode() 403 newWildEdge(start, end) 404 case '^': 405 start, end = newNode(), newNode() 406 newStartEdge(start, end) 407 case '$': 408 start, end = newNode(), newNode() 409 newEndEdge(start, end) 410 case ']': 411 panic(ErrUnmatchedRbkt) 412 case '[': 413 pos++ 414 start, end = pcharclass() 415 if len(s) == pos || ']' != s[pos] { 416 panic(ErrUnmatchedLbkt) 417 } 418 default: 419 start, end = newNode(), newNode() 420 newRuneEdge(start, end, maybeEscape()) 421 } 422 pos++ 423 return 424 } 425 pclosure := func() (start, end *node) { 426 start, end = pterm() 427 if start == end { 428 return 429 } 430 if len(s) == pos { 431 return 432 } 433 switch s[pos] { 434 case '*': 435 newNilEdge(end, start) 436 nend := newNode() 437 newNilEdge(end, nend) 438 start, end = end, nend 439 case '+': 440 newNilEdge(end, start) 441 nend := newNode() 442 newNilEdge(end, nend) 443 end = nend 444 case '?': 445 newNilEdge(start, end) 446 default: 447 return 448 } 449 pos++ 450 return 451 } 452 pcat := func() (start, end *node) { 453 for { 454 nstart, nend := pclosure() 455 if start == nil { 456 start, end = nstart, nend 457 } else if nstart != nend { 458 end.e = make([]*edge, len(nstart.e)) 459 copy(end.e, nstart.e) 460 end = nend 461 } 462 if nstart == nend { 463 return 464 } 465 } 466 panic("unreachable") 467 } 468 pre = func() (start, end *node) { 469 start, end = pcat() 470 for pos < len(s) && s[pos] != ')' { 471 if s[pos] != '|' { 472 panic(ErrInternal) 473 } 474 pos++ 475 nstart, nend := pcat() 476 tmp := newNode() 477 newNilEdge(tmp, start) 478 newNilEdge(tmp, nstart) 479 start = tmp 480 tmp = newNode() 481 newNilEdge(end, tmp) 482 newNilEdge(nend, tmp) 483 end = tmp 484 } 485 return 486 } 487 start, end := pre() 488 end.accept = true 489 490 // Compute shortlist of nodes (reachable nodes), as we may have discarded 491 // nodes left over from parsing. Also, make short[0] the start node. 492 short := make([]*node, 0, n) 493 { 494 var visit func(*node) 495 mark := make([]bool, n) 496 newn := make([]int, n) 497 visit = func(u *node) { 498 mark[u.n] = true 499 newn[u.n] = len(short) 500 short = append(short, u) 501 for _, e := range u.e { 502 if !mark[e.dst.n] { 503 visit(e.dst) 504 } 505 } 506 } 507 visit(start) 508 for _, v := range short { 509 v.n = newn[v.n] 510 } 511 } 512 n = len(short) 513 514 if nfadot != nil { 515 writeDotGraph(nfadot, start, "NFA_"+x.id) 516 } 517 518 // NFA -> DFA 519 nilClose := func(st []bool) { 520 mark := make([]bool, n) 521 var do func(int) 522 do = func(i int) { 523 v := short[i] 524 for _, e := range v.e { 525 if e.kind == kNil && !mark[e.dst.n] { 526 st[e.dst.n] = true 527 do(e.dst.n) 528 } 529 } 530 } 531 for i := 0; i < n; i++ { 532 if st[i] && !mark[i] { 533 mark[i] = true 534 do(i) 535 } 536 } 537 } 538 var todo []*node 539 tab := make(map[string]*node) 540 var buf []byte 541 dfacount := 0 542 { // Construct the node of no return. 543 for i := 0; i < n; i++ { 544 buf = append(buf, '0') 545 } 546 tmp := new(node) 547 tmp.n = -1 548 tab[string(buf)] = tmp 549 } 550 newDFANode := func(st []bool) (res *node, found bool) { 551 buf = nil 552 accept := false 553 for i, v := range st { 554 if v { 555 buf = append(buf, '1') 556 accept = accept || short[i].accept 557 } else { 558 buf = append(buf, '0') 559 } 560 } 561 res, found = tab[string(buf)] 562 if !found { 563 res = new(node) 564 res.n = dfacount 565 res.accept = accept 566 dfacount++ 567 for i, v := range st { 568 if v { 569 res.set = append(res.set, i) 570 } 571 } 572 tab[string(buf)] = res 573 } 574 return res, found 575 } 576 577 get := func(states []bool) *node { 578 nilClose(states) 579 node, old := newDFANode(states) 580 if !old { 581 todo = append(todo, node) 582 } 583 return node 584 } 585 getcb := func(v *node, cb func(*edge) bool) *node { 586 states := make([]bool, n) 587 for _, i := range v.set { 588 for _, e := range short[i].e { 589 if cb(e) { 590 states[e.dst.n] = true 591 } 592 } 593 } 594 return get(states) 595 } 596 states := make([]bool, n) 597 // The DFA start state is the state representing the nil-closure of the start 598 // node in the NFA. Recall it has index 0. 599 states[0] = true 600 dfastart := get(states) 601 for len(todo) > 0 { 602 v := todo[len(todo)-1] 603 todo = todo[0 : len(todo)-1] 604 // Singles. 605 var runes []rune 606 for r, _ := range sing { 607 runes = append(runes, r) 608 } 609 sort.Sort(RuneSlice(runes)) 610 for _, r := range runes { 611 newRuneEdge(v, getcb(v, func(e *edge) bool { 612 return e.kind == kRune && e.r == r || 613 e.kind == kWild || 614 e.kind == kClass && e.negate != inClass(r, e.lim) 615 }), r) 616 } 617 // Character ranges. 618 for j := 0; j < len(lim); j += 2 { 619 e := newClassEdge(v, getcb(v, func(e *edge) bool { 620 return e.kind == kWild || 621 e.kind == kClass && e.negate != inClass(lim[j], e.lim) 622 })) 623 624 e.lim = append(e.lim, lim[j], lim[j+1]) 625 } 626 // Wild. 627 newWildEdge(v, getcb(v, func(e *edge) bool { 628 return e.kind == kWild || (e.kind == kClass && e.negate) 629 })) 630 // ^ and $. 631 newStartEdge(v, getcb(v, func(e *edge) bool { return e.kind == kStart })) 632 newEndEdge(v, getcb(v, func(e *edge) bool { return e.kind == kEnd })) 633 } 634 n = dfacount 635 636 if dfadot != nil { 637 writeDotGraph(dfadot, dfastart, "DFA_"+x.id) 638 } 639 // DFA -> Go 640 sorted := make([]*node, n) 641 for _, v := range tab { 642 if -1 != v.n { 643 sorted[v.n] = v 644 } 645 } 646 647 fmt.Fprintf(out, "\n// %v\n", string(x.regex)) 648 for i, v := range sorted { 649 if i == 0 { 650 out.WriteString("{[]bool{") 651 } else { 652 out.WriteString(", ") 653 } 654 if v.accept { 655 out.WriteString("true") 656 } else { 657 out.WriteString("false") 658 } 659 } 660 out.WriteString("}, []func(rune) int{ // Transitions\n") 661 for _, v := range sorted { 662 out.WriteString("func(r rune) int {\n") 663 var runeCases, classCases string 664 var wildDest int 665 for _, e := range v.e { 666 m := e.dst.n 667 switch e.kind { 668 case kRune: 669 runeCases += fmt.Sprintf("\t\tcase %d: return %d\n", e.r, m) 670 case kClass: 671 classCases += fmt.Sprintf("\t\tcase %d <= r && r <= %d: return %d\n", 672 e.lim[0], e.lim[1], m) 673 case kWild: 674 wildDest = m 675 } 676 } 677 if runeCases != "" { 678 out.WriteString("\tswitch(r) {\n" + runeCases + "\t}\n") 679 } 680 if classCases != "" { 681 out.WriteString("\tswitch {\n" + classCases + "\t}\n") 682 } 683 fmt.Fprintf(out, "\treturn %v\n},\n", wildDest) 684 } 685 out.WriteString("}, []int{ /* Start-of-input transitions */ ") 686 for _, v := range sorted { 687 s := " -1," 688 for _, e := range v.e { 689 if e.kind == kStart { 690 s = fmt.Sprintf(" %d,", e.dst.n) 691 break 692 } 693 } 694 out.WriteString(s) 695 } 696 out.WriteString("}, []int{ /* End-of-input transitions */ ") 697 for _, v := range sorted { 698 s := " -1," 699 for _, e := range v.e { 700 if e.kind == kEnd { 701 s = fmt.Sprintf(" %d,", e.dst.n) 702 break 703 } 704 } 705 out.WriteString(s) 706 } 707 out.WriteString("},") 708 if len(x.kid) == 0 { 709 out.WriteString("nil") 710 } else { 711 out.WriteString("[]dfa{") 712 for _, kid := range x.kid { 713 gen(out, kid) 714 } 715 out.WriteString("}") 716 } 717 out.WriteString("},\n") 718 } 719 720 func writeFamily(out *bufio.Writer, node *rule, lvl int) { 721 tab := func() { 722 for i := 0; i <= lvl; i++ { 723 out.WriteByte('\t') 724 } 725 } 726 if node.startCode != "" { 727 tab() 728 prefixReplacer.WriteString(out, "if !yylex.stale {\n") 729 tab() 730 out.WriteString("\t" + node.startCode + "\n") 731 tab() 732 out.WriteString("}\n") 733 } 734 tab() 735 fmt.Fprintf(out, "OUTER%s%d:\n", node.id, lvl) 736 tab() 737 prefixReplacer.WriteString(out, 738 fmt.Sprintf("for {\nnext:=yylex.next(%v)\nlval.line = yylex.Line()+1\nlval.column = yylex.Column()+1\nswitch next {\n", lvl)) 739 for i, x := range node.kid { 740 tab() 741 fmt.Fprintf(out, "\tcase %d:\n", i) 742 lvl++ 743 if x.kid != nil { 744 writeFamily(out, x, lvl) 745 } else { 746 tab() 747 out.WriteString("\t" + x.code + "\n") 748 } 749 lvl-- 750 } 751 tab() 752 out.WriteString("\tdefault:\n") 753 tab() 754 fmt.Fprintf(out, "\t\t break OUTER%s%d\n", node.id, lvl) 755 tab() 756 out.WriteString("\t}\n") 757 tab() 758 out.WriteString("\tcontinue\n") 759 tab() 760 out.WriteString("}\n") 761 tab() 762 prefixReplacer.WriteString(out, "yylex.pop()\n") 763 tab() 764 out.WriteString(node.endCode + "\n") 765 } 766 767 var lexertext = `import ("bufio";"io";"strings") 768 type frame struct { 769 i int 770 s string 771 line, column int 772 } 773 774 type Lexer struct { 775 // The lexer runs in its own goroutine, and communicates via channel 'ch'. 776 ch chan frame 777 ch_stop chan bool 778 // We record the level of nesting because the action could return, and a 779 // subsequent call expects to pick up where it left off. In other words, 780 // we're simulating a coroutine. 781 // TODO: Support a channel-based variant that compatible with Go's yacc. 782 stack []frame 783 stale bool 784 785 // The 'l' and 'c' fields were added for 786 // https://github.com/wagerlabs/docker/blob/65694e801a7b80930961d70c69cba9f2465459be/buildfile.nex 787 // Now used to record last seen line & column from the stack. 788 l, c int 789 790 parseResult interface{} 791 792 // The following line makes it easy for scripts to insert fields in the 793 // generated code. 794 // [NEX_END_OF_LEXER_STRUCT] 795 } 796 797 // NewLexerWithInit creates a new Lexer object, runs the given callback on it, 798 // then returns it. 799 func NewLexerWithInit(in io.Reader, initFun func(*Lexer)) *Lexer { 800 yylex := new(Lexer) 801 if initFun != nil { 802 initFun(yylex) 803 } 804 yylex.ch = make(chan frame) 805 yylex.ch_stop = make(chan bool, 1) 806 var scan func(in *bufio.Reader, ch chan frame, ch_stop chan bool, family []dfa, line, column int) 807 scan = func(in *bufio.Reader, ch chan frame, ch_stop chan bool, family []dfa, line, column int) { 808 // Index of DFA and length of highest-precedence match so far. 809 matchi, matchn := 0, -1 810 var buf []rune 811 n := 0 812 checkAccept := func(i int, st int) bool { 813 // Higher precedence match? DFAs are run in parallel, so matchn is at most len(buf), hence we may omit the length equality check. 814 if family[i].acc[st] && (matchn < n || matchi > i) { 815 matchi, matchn = i, n 816 return true 817 } 818 return false 819 } 820 stateCap := len(family) 821 if stateCap == 0 { stateCap = 1 } 822 state := make([][2]int, 0, stateCap) 823 for i := 0; i < len(family); i++ { 824 mark := make([]bool, len(family[i].startf)) 825 // Every DFA starts at state 0. 826 st := 0 827 for { 828 state = append(state, [2]int{i, st}) 829 mark[st] = true 830 // As we're at the start of input, follow all ^ transitions and append to our list of start states. 831 st = family[i].startf[st] 832 if -1 == st || mark[st] { break } 833 // We only check for a match after at least one transition. 834 checkAccept(i, st) 835 } 836 } 837 atEOF := false 838 stopped := false 839 840 loop: 841 for { 842 if n == len(buf) && !atEOF { 843 r,_,err := in.ReadRune() 844 switch err { 845 case io.EOF: atEOF = true 846 case nil: buf = append(buf, r) 847 default: panic(err) 848 } 849 } 850 if !atEOF { 851 r := buf[n] 852 n++ 853 d := 0 854 for _, x := range state { 855 x[1] = family[x[0]].f[x[1]](r) 856 if -1 == x[1] { continue } 857 state[d] = x 858 d++ 859 checkAccept(x[0], x[1]) 860 } 861 state = state[:d] 862 } else { 863 dollar: // Handle $. 864 for _, x := range state { 865 mark := make([]bool, len(family[x[0]].endf)) 866 for { 867 mark[x[1]] = true 868 x[1] = family[x[0]].endf[x[1]] 869 if -1 == x[1] || mark[x[1]] { break } 870 if checkAccept(x[0], x[1]) { 871 // Unlike before, we can break off the search. Now that we're at the end, there's no need to maintain the state of each DFA. 872 break dollar 873 } 874 } 875 } 876 state = state[:0] 877 } 878 879 if len(state) == 0 { 880 lcUpdate := func(r rune) { 881 if r == '\n' { 882 line++ 883 column = 0 884 } else { 885 column++ 886 } 887 } 888 // All DFAs stuck. Return last match if it exists, otherwise advance by one rune and restart all DFAs. 889 if matchn == -1 { 890 if len(buf) == 0 { // This can only happen at the end of input. 891 break 892 } 893 lcUpdate(buf[0]) 894 buf = buf[1:] 895 } else { 896 text := string(buf[:matchn]) 897 buf = buf[matchn:] 898 matchn = -1 899 900 select { 901 case <- ch_stop: 902 stopped = true 903 break loop 904 default: 905 } 906 select { 907 case ch <- frame{matchi, text, line, column}: 908 case <- ch_stop: 909 stopped = true 910 break loop 911 } 912 if len(family[matchi].nest) > 0 { 913 scan(bufio.NewReader(strings.NewReader(text)), ch, ch_stop, family[matchi].nest, line, column) 914 } 915 if atEOF { 916 break 917 } 918 for _, r := range text { 919 lcUpdate(r) 920 } 921 } 922 n = 0 923 if len(family) > cap(state) { 924 state = make([][2]int, 0, len(family)) 925 } 926 for i := 0; i < len(family); i++ { 927 state = append(state, [2]int{i, 0}) 928 } 929 } 930 } 931 select { 932 case <- ch_stop: 933 stopped = true 934 default: 935 } 936 if !stopped { 937 select { 938 case ch <- frame{-1, "", line, column}: 939 940 case <- ch_stop: 941 } 942 } 943 } 944 go scan(bufio.NewReader(in), yylex.ch, yylex.ch_stop, dfas, 0, 0) 945 return yylex 946 } 947 948 type dfa struct { 949 acc []bool // Accepting states. 950 f []func(rune) int // Transitions. 951 startf, endf []int // Transitions at start and end of input. 952 nest []dfa 953 } 954 955 var dfas = []dfa{` 956 957 var lexeroutro = `} 958 959 func NewLexer(in io.Reader) *Lexer { 960 return NewLexerWithInit(in, nil) 961 } 962 963 func (yyLex *Lexer) Stop() { 964 select { 965 case yyLex.ch_stop <- true: 966 default: 967 } 968 } 969 970 // Text returns the matched text. 971 func (yylex *Lexer) Text() string { 972 return yylex.stack[len(yylex.stack) - 1].s 973 } 974 975 // Line returns the current line number. 976 // The first line is 0. 977 func (yylex *Lexer) Line() int { 978 if len(yylex.stack) == 0 { 979 return yylex.l 980 } 981 return yylex.stack[len(yylex.stack) - 1].line 982 } 983 984 // Column returns the current column number. 985 // The first column is 0. 986 func (yylex *Lexer) Column() int { 987 if len(yylex.stack) == 0 { 988 return yylex.c 989 } 990 return yylex.stack[len(yylex.stack) - 1].column 991 } 992 993 func (yylex *Lexer) next(lvl int) int { 994 if lvl == len(yylex.stack) { 995 l, c := 0, 0 996 if lvl > 0 { 997 l, c = yylex.stack[lvl - 1].line, yylex.stack[lvl - 1].column 998 } 999 yylex.stack = append(yylex.stack, frame{0, "", l, c}) 1000 } 1001 if lvl == len(yylex.stack) - 1 { 1002 p := &yylex.stack[lvl] 1003 *p = <-yylex.ch 1004 yylex.stale = false 1005 } else { 1006 yylex.stale = true 1007 } 1008 return yylex.stack[lvl].i 1009 } 1010 func (yylex *Lexer) pop() { 1011 l := len(yylex.stack)-1 1012 yylex.l, yylex.c = yylex.stack[l].line, yylex.stack[l].column 1013 yylex.stack = yylex.stack[:l] 1014 } 1015 ` 1016 1017 func writeLex(out *bufio.Writer, root rule) { 1018 if !customError { 1019 // TODO: I can't remember what this was for! 1020 prefixReplacer.WriteString(out, `func (yylex Lexer) Error(e string) { 1021 panic(e) 1022 }`) 1023 } 1024 prefixReplacer.WriteString(out, ` 1025 // Lex runs the lexer. 1026 // When the -s option is given, this function is not generated; 1027 // instead, the NN_FUN macro runs the lexer. 1028 // yySymType is expected to include the int fields, line and column. 1029 func (yylex *Lexer) Lex(lval *yySymType) int { 1030 `) 1031 writeFamily(out, &root, 0) 1032 out.WriteString("\treturn 0\n}\n") 1033 } 1034 func writeNNFun(out *bufio.Writer, root rule) { 1035 prefixReplacer.WriteString(out, "func(yylex *Lexer) {\n") 1036 writeFamily(out, &root, 0) 1037 out.WriteString("}") 1038 } 1039 func process(output io.Writer, input io.Reader) error { 1040 lineno := 1 1041 in := bufio.NewReader(input) 1042 out := bufio.NewWriter(output) 1043 var r rune 1044 read := func() bool { 1045 var err error 1046 r, _, err = in.ReadRune() 1047 if err == io.EOF { 1048 return true 1049 } 1050 if err != nil { 1051 panic(err) 1052 } 1053 if r == '\n' { 1054 lineno++ 1055 } 1056 return false 1057 } 1058 skipws := func() bool { 1059 for !read() { 1060 if strings.IndexRune(" \n\t\r", r) == -1 { 1061 return false 1062 } 1063 } 1064 return true 1065 } 1066 var buf []rune 1067 readCode := func() string { 1068 if '{' != r { 1069 panic(ErrExpectedLBrace) 1070 } 1071 buf = []rune{r} 1072 nesting := 1 1073 for { 1074 if read() { 1075 panic(ErrUnmatchedLBrace) 1076 } 1077 buf = append(buf, r) 1078 if '{' == r { 1079 nesting++ 1080 } else if '}' == r { 1081 nesting-- 1082 if 0 == nesting { 1083 break 1084 } 1085 } 1086 } 1087 return string(buf) 1088 } 1089 var root rule 1090 needRootRAngle := false 1091 var parse func(*rule) error 1092 parse = func(node *rule) error { 1093 for { 1094 panicIf(skipws, ErrUnexpectedEOF) 1095 if '<' == r { 1096 if node != &root || len(node.kid) > 0 { 1097 panic(ErrUnexpectedLAngle) 1098 } 1099 panicIf(skipws, ErrUnexpectedEOF) 1100 node.startCode = readCode() 1101 needRootRAngle = true 1102 continue 1103 } else if '>' == r { 1104 if node == &root { 1105 if !needRootRAngle { 1106 panic(ErrUnmatchedRAngle) 1107 } 1108 } 1109 if skipws() { 1110 return ErrUnexpectedEOF 1111 } 1112 node.endCode = readCode() 1113 return nil 1114 } 1115 delim := r 1116 panicIf(read, ErrUnexpectedEOF) 1117 var regex []rune 1118 for { 1119 if r == delim && (len(regex) == 0 || regex[len(regex)-1] != '\\') { 1120 break 1121 } 1122 if '\n' == r { 1123 return ErrUnexpectedNewline 1124 } 1125 regex = append(regex, r) 1126 panicIf(read, ErrUnexpectedEOF) 1127 } 1128 if "" == string(regex) { 1129 break 1130 } 1131 panicIf(skipws, ErrUnexpectedEOF) 1132 x := new(rule) 1133 x.id = fmt.Sprintf("%d", lineno) 1134 node.kid = append(node.kid, x) 1135 x.regex = make([]rune, len(regex)) 1136 copy(x.regex, regex) 1137 if '<' == r { 1138 panicIf(skipws, ErrUnexpectedEOF) 1139 x.startCode = readCode() 1140 parse(x) 1141 } else { 1142 x.code = readCode() 1143 } 1144 } 1145 return nil 1146 } 1147 err := parse(&root) 1148 if err != nil { 1149 return err 1150 } 1151 1152 buf = nil 1153 for done := skipws(); !done; done = read() { 1154 buf = append(buf, r) 1155 } 1156 fs := token.NewFileSet() 1157 // Append a blank line to make things easier when there are only package and 1158 // import declarations. 1159 t, err := parser.ParseFile(fs, "", string(buf)+"\n", parser.ImportsOnly) 1160 if err != nil { 1161 panic(err) 1162 } 1163 printer.Fprint(out, fs, t) 1164 1165 var file *token.File 1166 fs.Iterate(func(f *token.File) bool { 1167 file = f 1168 return true 1169 }) 1170 1171 // Skip over package and import declarations. This is why we appended a blank 1172 // line above. 1173 for m := file.LineCount(); m > 1; m-- { 1174 i := 0 1175 for '\n' != buf[i] { 1176 i++ 1177 } 1178 buf = buf[i+1:] 1179 } 1180 1181 prefixReplacer.WriteString(out, lexertext) 1182 1183 for _, kid := range root.kid { 1184 gen(out, kid) 1185 } 1186 prefixReplacer.WriteString(out, lexeroutro) 1187 if !standalone { 1188 writeLex(out, root) 1189 out.WriteString(string(buf)) 1190 out.Flush() 1191 if len(outFilename) > 0 { 1192 gofmt() 1193 } 1194 return nil 1195 } 1196 m := 0 1197 const funmac = "NN_FUN" 1198 for m < len(buf) { 1199 m++ 1200 if funmac[:m] != string(buf[:m]) { 1201 out.WriteString(string(buf[:m])) 1202 buf = buf[m:] 1203 m = 0 1204 } else if funmac == string(buf[:m]) { 1205 writeNNFun(out, root) 1206 buf = buf[m:] 1207 m = 0 1208 } 1209 } 1210 out.WriteString(string(buf)) 1211 out.Flush() 1212 if len(outFilename) > 0 { 1213 gofmt() 1214 } 1215 return nil 1216 } 1217 1218 func gofmt() { 1219 src, err := ioutil.ReadFile(outFilename) 1220 if err != nil { 1221 return 1222 } 1223 src, err = format.Source(src) 1224 if err != nil { 1225 return 1226 } 1227 ioutil.WriteFile(outFilename, src, 0666) 1228 } 1229 1230 func panicIf(f func() bool, err error) { 1231 if f() { 1232 panic(err) 1233 } 1234 } 1235 1236 func dieIf(cond bool, v ...interface{}) { 1237 if cond { 1238 log.Fatal(v...) 1239 } 1240 } 1241 1242 func dieErr(err error, s string) { 1243 if err != nil { 1244 log.Fatalf("%v: %v", s, err) 1245 } 1246 } 1247 1248 func createDotFile(filename string) *os.File { 1249 if filename == "" { 1250 return nil 1251 } 1252 suf := strings.HasSuffix(filename, ".nex") 1253 dieIf(suf, "nex: DOT filename ends with .nex:", filename) 1254 file, err := os.Create(filename) 1255 dieErr(err, "Create") 1256 return file 1257 }