github.com/wbrown/gpt_bpe@v0.0.0-20250709161131-1571a6e8ad2d/runetree.go (about) 1 package gpt_bpe 2 3 import ( 4 "regexp/syntax" 5 "strings" 6 "unicode" 7 ) 8 9 type RuneNode struct { 10 rune rune // The rune this node represents. 11 runes []rune // The prior runes that led to this node. 12 terminal bool // If this node is an absolute terminal node. 13 replacement *[]rune // The replacement runes for this node. 14 childs map[rune]*RuneNode // The child nodes. 15 childsArr *[]*RuneNode // The child nodes in an array, for precedence 16 isPrefix bool // Whether this node is a valid prefix match 17 isContractionTree bool // Whether this node is a contraction tree 18 } 19 20 type RuneNodes []*RuneNode 21 22 func runeIsIn(r rune, runes []rune) bool { 23 for _, rr := range runes { 24 if r == rr { 25 return true 26 } 27 } 28 return false 29 } 30 31 func (nodes *RuneNodes) evaluate(r rune) *RuneNode { 32 var idx int 33 var candidate *RuneNode 34 for idx, candidate = range *nodes { 35 36 var isContraction bool 37 if candidate.isContractionTree { 38 isContraction = true 39 } 40 candidate = candidate.evaluate(r) 41 // ' is not a contraction but 's is, 42 // so we don't care about nils if we're in a contraction tree 43 if candidate == nil && isContraction { 44 continue 45 } 46 47 (*nodes)[idx] = candidate 48 49 if candidate != nil && (candidate.terminal || candidate. 50 replacement != nil) { 51 break 52 } 53 } 54 // Clean out any nodes that are no longer valid. 55 for idx = 0; idx < len(*nodes); idx++ { 56 if idx >= len(*nodes) { 57 break 58 } 59 if (*nodes)[idx] == nil { 60 *nodes = append((*nodes)[:idx], (*nodes)[idx+1:]...) 61 idx-- 62 } 63 } 64 return candidate 65 } 66 67 func (node *RuneNode) evaluate(r rune) *RuneNode { 68 // If the node has an array of children, use that. The array exists if the 69 // node has less than 10 children, and is used to speed up the evaluation 70 // of the node. 71 if node.childsArr != nil { 72 children := *node.childsArr 73 for _, child := range children { 74 if child.rune == r { 75 return child 76 } 77 } 78 } else { 79 child, ok := node.childs[r] 80 if ok { 81 return child 82 } 83 } 84 return nil 85 } 86 87 // Represent the tree as a string by traversing the tree, and using tree 88 // characters to represent the tree structure. 89 func (node *RuneNode) string(level int, sb *strings.Builder) { 90 if node == nil { 91 return 92 } 93 sb.WriteRune(node.rune) 94 idx := 0 95 if len(node.childs) == 1 { 96 // Get the only element from the map recursively until we find a node 97 // with more than one child. 98 for r := range node.childs { 99 node.childs[r].string(level, sb) 100 } 101 return 102 } 103 level += 1 104 if node.replacement != nil { 105 sb.WriteString(" -> ") 106 sb.WriteString(string(*node.replacement)) 107 } 108 sb.WriteByte('\n') 109 110 for r := range node.childs { 111 sb.WriteString(strings.Repeat("| ", level-1)) 112 // If we're the last child, then we prepend with a tree terminator. 113 if idx == len(node.childs)-1 { 114 sb.WriteString("└─") 115 } else { 116 sb.WriteString("├─") 117 } 118 node.childs[r].string(level, sb) 119 idx += 1 120 } 121 } 122 123 // Wrapper 124 func (runeTree *RuneNode) String() string { 125 sb := strings.Builder{} 126 runeTree.string(0, &sb) 127 return sb.String() 128 } 129 130 func (runeTree *RuneNode) insertRunes(runes []rune) (node *RuneNode) { 131 node = runeTree 132 keyLen := len(runes) 133 for i := 0; i < keyLen; i++ { 134 r := runes[i] 135 childNode, ok := node.childs[r] 136 if !ok { 137 children := make([]*RuneNode, 0) 138 node.childs[r] = &RuneNode{ 139 rune: r, 140 runes: runes[:i+1], 141 terminal: i == keyLen-1, 142 childs: make(map[rune]*RuneNode, 0), 143 childsArr: &children, 144 isContractionTree: node.isContractionTree, 145 } 146 } else if i == keyLen-1 { 147 childNode.terminal = true 148 } 149 if len(node.childs) > 10 { 150 // If there are more than 10 children, we set the array pointer 151 // to nil, so that we can use the map instead. 152 node.childsArr = nil 153 } else { 154 if node.childsArr == nil { 155 children := make([]*RuneNode, 0) 156 node.childsArr = &children 157 } 158 if len(node.childs) != len(*node.childsArr) { 159 *node.childsArr = append(*node.childsArr, node.childs[r]) 160 } 161 } 162 node = node.childs[r] 163 } 164 return node 165 } 166 167 func NewRuneTree() *RuneNode { 168 return &RuneNode{ 169 runes: []rune{}, 170 childs: make(map[rune]*RuneNode, 0), 171 } 172 } 173 174 // ContractionsTree creates a specialized RuneTree for handling contractions 175 func CreateContractionsTree() *RuneNode { 176 tree := NewRuneTree() 177 contractions := []string{ 178 "'s", "'t", "'re", "'ve", "'m", "'ll", "'d", 179 } 180 // Insert each contraction into the tree 181 for _, c := range contractions { 182 tree.insertRunes([]rune(c)) 183 } 184 tree.isContractionTree = true 185 return tree 186 } 187 188 func (runeTree *RuneNode) InsertReplacementsIntoRuneTree( 189 replacements map[string]string, 190 ) { 191 for k, v := range replacements { 192 keyRunes := []rune(k) 193 valueRunes := []rune(v) 194 keyNode := runeTree.insertRunes(keyRunes) 195 keyNode.replacement = &valueRunes 196 } 197 } 198 199 func CreateReplacementsRuneTree(replacements map[string]string) *RuneNode { 200 runeTree := NewRuneTree() 201 runeTree.isContractionTree = false 202 runeTree.InsertReplacementsIntoRuneTree(replacements) 203 return runeTree 204 } 205 206 func (runeTree *RuneNode) InsertIntoRuneTree(s []string) { 207 for _, k := range s { 208 keyRunes := []rune(k) 209 runeTree.insertRunes(keyRunes) 210 } 211 } 212 213 // Create a new rune tree from an array of strings to match against. 214 func CreateRuneTree(s []string) *RuneNode { 215 runeTree := NewRuneTree() 216 runeTree.isContractionTree = false 217 runeTree.InsertIntoRuneTree(s) 218 return runeTree 219 } 220 221 type rangeTuple struct { 222 start int 223 end int 224 } 225 226 // The AST is given as a []rune where every two runes are the start and end of a range 227 // We want to convert this to a list of rangeTuples for easier handling 228 func ArrayAsRanges(runes []rune) []rangeTuple { 229 // [65 90 97 122 170 170 181 181 186 186 192 214 216 246 248 705 ... 230 // All are pairs of 2, start and end of a range, print as X-Y 231 ranges := make([]rangeTuple, 0) 232 for i := 0; i < len(runes); i += 2 { 233 ranges = append(ranges, rangeTuple{start: int(runes[i]), end: int(runes[i+1])}) 234 } 235 return ranges 236 } 237 238 // We will need to populate a lookup table for the ranges 239 // Once per node. Use binary search to find the rune in the ranges 240 func populateCharRanges(i int, ranges []rangeTuple) bool { 241 // Binary search 242 low, high := 0, len(ranges)-1 243 for low <= high { 244 mid := low + (high-low)/2 245 if ranges[mid].start <= i && i <= ranges[mid].end { 246 return true 247 } 248 if i < ranges[mid].start { 249 high = mid - 1 250 } else { 251 low = mid + 1 252 } 253 } 254 // If we didn't find the rune in the ranges, return false 255 return false 256 } 257 258 type RangeLUT struct { 259 lookup []bool 260 } 261 262 func newRangeLUT(ranges []rangeTuple) *RangeLUT { 263 maxLutSize := ranges[len(ranges)-1].end + 1 264 lut := &RangeLUT{ 265 lookup: make([]bool, maxLutSize), 266 } 267 for i := 0; i < len(lut.lookup); i++ { 268 lut.lookup[i] = populateCharRanges(i, ranges) 269 } 270 return lut 271 } 272 273 // Once we have done it once, we can now use a lookup table to find the rune in the ranges 274 func containsCharInRange(r rune, lut *RangeLUT) bool { 275 if lut != nil && int(r) < len(lut.lookup) { 276 return lut.lookup[int(r)] 277 } else { 278 return false 279 } 280 } 281 282 // Nodes of the regex tree 283 type RegexNode struct { 284 runeArray []rune // The runes this node represents, used in literals and char classes 285 parent *RegexNode // The parent node 286 children []*RegexNode // The child nodes 287 min int // The min number of matches, set previously, used in literals and char classes 288 max int // The max number of matches, set previously, used in literals and char classes 289 flags int // Any flags set on the node, Unused for now 290 lastOp string // The operation of the node prior 291 thisOp string // The operation of the node 292 pathStrings []string // The string representation of the path to this node 293 rangeLUT *RangeLUT // The lookup table for char classes 294 } 295 296 func CreateRegexTree(AST *syntax.Regexp) *RegexNode { 297 // Given a syntax.regexp assumed as the root, create a tree of RegexNodes 298 // We want the info nodes to inform the op nodes of their min/max, flags, and last op 299 300 // Create the root node 301 root := &RegexNode{ 302 runeArray: AST.Rune, 303 parent: nil, 304 children: make([]*RegexNode, 0), 305 min: AST.Min, 306 max: AST.Max, 307 flags: int(AST.Flags), 308 lastOp: AST.Op.String(), 309 thisOp: AST.Op.String(), 310 pathStrings: make([]string, 0), 311 } 312 root.parent = root 313 root.pathStrings = append(root.pathStrings, "(root)") 314 315 // Create the tree 316 ASTPath := make([]string, 0) 317 ASTPath = append(ASTPath, "(root)") 318 root.createTree(AST, ASTPath) 319 320 return root 321 } 322 323 func (rn *RegexNode) createTree(AST *syntax.Regexp, ASTPath []string) { 324 // Create the tree 325 lastOp := AST.Op.String() 326 ASTPath = append(ASTPath, lastOp) 327 328 for _, sub := range AST.Sub { 329 // Create a new node 330 newNode := &RegexNode{ 331 runeArray: sub.Rune, 332 parent: rn, 333 children: make([]*RegexNode, 0), 334 min: sub.Min, 335 max: sub.Max, 336 flags: int(sub.Flags), 337 lastOp: lastOp, 338 thisOp: sub.Op.String(), 339 pathStrings: ASTPath, 340 } 341 if len(sub.Sub) > 0 { 342 newNode.createTree(sub, ASTPath) 343 } 344 rn.children = append(rn.children, newNode) 345 } 346 } 347 348 // We need a path map to know where we are in the tree 349 func (rn *RegexNode) GeneratePathMap() [][]int { 350 var pathMap [][]int 351 generatePathMap(rn, 0, []int{}, &pathMap) 352 return pathMap 353 } 354 355 func generatePathMap( 356 rn *RegexNode, 357 parentIndex int, 358 currentPath []int, 359 pathMap *[][]int, 360 ) { 361 // Generate a map of the tree with dfs 362 currentPath = append(currentPath, parentIndex) 363 364 // If not already in the map, add the current path 365 pathCopy := make([]int, len(currentPath)) 366 copy(pathCopy, currentPath) 367 *pathMap = append(*pathMap, pathCopy) 368 for idx, child := range rn.children { 369 generatePathMap(child, idx, currentPath, pathMap) 370 } 371 372 } 373 374 func (rn *RegexNode) String() string { 375 // Print the tree 376 sb := strings.Builder{} 377 rn.string(0, &sb) 378 return sb.String() 379 } 380 381 func (rn *RegexNode) string(level int, sb *strings.Builder) { 382 if rn == nil { 383 return 384 } 385 if len(rn.runeArray) > 50 { 386 sb.WriteString(string(rn.runeArray[:50])) 387 } else { 388 sb.WriteString(string(rn.runeArray)) 389 } 390 idx := 0 391 if len(rn.children) == 1 { 392 // Get the only element from the map recursively until we find a node 393 // with more than one child. 394 for r := range rn.children { 395 rn.children[r].string(level, sb) 396 } 397 return 398 } 399 level += 1 400 sb.WriteString(" -> ") 401 sb.WriteString(rn.lastOp) 402 sb.WriteByte('\n') 403 404 for r := range rn.children { 405 sb.WriteString(strings.Repeat("| ", level-1)) 406 // If we're the last child, then we prepend with a tree terminator. 407 if idx == len(rn.children)-1 { 408 sb.WriteString("└─") 409 } else { 410 sb.WriteString("├─") 411 } 412 rn.children[r].string(level, sb) 413 idx += 1 414 } 415 } 416 417 // Variables saved during and between traversals 418 type matchVariables struct { 419 matchedWords []string // The words that have been matched 420 subjectRuneArrIndex int // The index of the last rune matched 421 subjectRuneCandidateIndices []int // The indices of the runes that are candidates for matching 422 currentNodeIdx int // The index of the current node in the path map 423 pathMap [][]int // The path map of the tree 424 ParentOp string // The operation of the parent node from where we are 425 minGroupSize int // The minimum number of runes that must be matched 426 maxGroupSize int // The maximum number of runes that can be matched 427 candidateRunes []rune // The runes that are candidates for matching 428 skipUntilNum int // The number of nodes to skip until the next node that isn't a child of the current node 429 rootNode *RegexNode // The root node of the tree 430 endEval bool // Whether we should end the evaluation 431 lastInfoOpLevel int // The level of the last info op, used for resetting group sizes 432 parentMatched bool // The direct parent of the current node has at least one match 433 } 434 435 // We want to take a string and use pre-order traversal to match the string to the tree, in a regex-like fashion 436 // This is much faster than using the regex package. 437 // The input is a pathmap generate from the regex tree, and the runes to match 438 // The output is a list of strings that have been matched 439 func (rn *RegexNode) EvaluateRegexTree(runes []rune, pathMap [][]int) []string { 440 // Init variables 441 var matchVars matchVariables 442 matchVars.matchedWords = make([]string, 0) 443 matchVars.subjectRuneArrIndex = 0 444 matchVars.currentNodeIdx = 0 445 matchVars.minGroupSize = 1 446 matchVars.maxGroupSize = -1 447 matchVars.candidateRunes = make([]rune, 0, 64) 448 matchVars.subjectRuneCandidateIndices = []int{0} 449 matchVars.pathMap = pathMap 450 matchVars.rootNode = rn 451 matchVars.endEval = false 452 matchVars.lastInfoOpLevel = 1 453 454 // Start the traversal 455 for { 456 rn.traverseRegexTree(runes, &matchVars, 0) 457 if matchVars.subjectRuneArrIndex >= len(runes) { 458 break 459 } 460 // Reset for next round 461 matchVars.currentNodeIdx = 0 462 matchVars.minGroupSize = 1 463 matchVars.maxGroupSize = -1 464 matchVars.candidateRunes = matchVars.candidateRunes[:0] 465 matchVars.subjectRuneCandidateIndices[0] = matchVars.subjectRuneArrIndex 466 matchVars.subjectRuneCandidateIndices = matchVars.subjectRuneCandidateIndices[:1] 467 matchVars.skipUntilNum = 0 468 matchVars.endEval = false 469 matchVars.lastInfoOpLevel = 1 470 } 471 472 return matchVars.matchedWords 473 } 474 475 // The recursive function that traverses the tree 476 func (rn *RegexNode) traverseRegexTree( 477 runes []rune, 478 matchVars *matchVariables, 479 level int, 480 ) { 481 // Pre-order traversal of the tree 482 if matchVars.endEval { 483 return 484 } 485 level += 1 486 thisNodeMap := matchVars.pathMap[matchVars.currentNodeIdx] 487 lastNodeMap := make([]int, 0) 488 if matchVars.currentNodeIdx > 0 { 489 lastNodeMap = matchVars.pathMap[matchVars.currentNodeIdx-1] 490 } 491 thisNodeRuneIdx := -1 492 thisNodeRuneParentIdx := 0 493 494 // Check if we are at the branch root and have a accumulated split 495 if len(thisNodeMap) == 2 && len(matchVars.candidateRunes) != 0 { 496 strMatched := string(matchVars.candidateRunes) 497 matchVars.matchedWords = append(matchVars.matchedWords, strMatched) 498 matchVars.subjectRuneArrIndex += len(matchVars.candidateRunes) 499 500 // Finish Round 501 matchVars.endEval = true 502 return 503 } else if len(thisNodeMap) == 2 { 504 // Reset candidate indices if we are bach at the branch root 505 matchVars.subjectRuneCandidateIndices[0] = matchVars.subjectRuneArrIndex 506 matchVars.subjectRuneCandidateIndices = matchVars.subjectRuneCandidateIndices[:1] 507 } else if len(thisNodeMap) != len(lastNodeMap) && len(lastNodeMap) != 0 { 508 // We have either traversed up or down the tree 509 // Reset parent match variable 510 matchVars.parentMatched = false 511 } 512 513 // Evaluate the current node 514 if matchVars.skipUntilNum == 0 { 515 // if the index isn't of the right length, we append the index to the candidate indices 516 if len(matchVars.subjectRuneCandidateIndices) < len(thisNodeMap) { 517 candidateRuneArray := matchVars.subjectRuneCandidateIndices[len(matchVars.subjectRuneCandidateIndices)-1] 518 matchVars.subjectRuneCandidateIndices = append( 519 matchVars.subjectRuneCandidateIndices, candidateRuneArray, 520 ) 521 } else { 522 // Trim to the right length 523 matchVars.subjectRuneCandidateIndices = matchVars.subjectRuneCandidateIndices[:len(thisNodeMap)] 524 } 525 thisNodeRuneIdx = matchVars.subjectRuneCandidateIndices[len(matchVars.subjectRuneCandidateIndices)-1] 526 if len(matchVars.subjectRuneCandidateIndices) > 1 { 527 thisNodeRuneParentIdx = matchVars.subjectRuneCandidateIndices[len(matchVars.subjectRuneCandidateIndices)-2] 528 } 529 530 switch rn.thisOp { 531 case "Alternate": 532 // Nothing needs to happen if we have these nodes here 533 case "Concat": 534 // Nothing needs to happen if we have these nodes here 535 case "Quest": 536 // Set minmax for the next nodes 537 matchVars.minGroupSize = 0 538 matchVars.maxGroupSize = 1 539 matchVars.lastInfoOpLevel = level 540 case "Plus": 541 // Set minmax for the next nodes 542 matchVars.minGroupSize = 1 543 matchVars.maxGroupSize = -1 544 matchVars.lastInfoOpLevel = level 545 case "Repeat": 546 // Set minmax for the next nodes 547 matchVars.minGroupSize = rn.min 548 matchVars.maxGroupSize = rn.max 549 matchVars.lastInfoOpLevel = level 550 case "Star": 551 // Set minmax for the next nodes 552 matchVars.minGroupSize = 0 553 matchVars.maxGroupSize = -1 554 matchVars.lastInfoOpLevel = level 555 case "Literal": 556 // Evaluate the literal 557 caseInsensitiveFlag := false 558 if rn.flags&int(syntax.FoldCase) != 0 { 559 caseInsensitiveFlag = true 560 } 561 matches := 0 562 matchArr := make([]rune, 0) 563 for i := 0; i < len(rn.runeArray); i++ { 564 if thisNodeRuneIdx+i < len(runes) { 565 if rn.runeArray[i] == runes[thisNodeRuneIdx+i] { 566 matches += 1 567 matchArr = append(matchArr, runes[thisNodeRuneIdx+i]) 568 } else { 569 if caseInsensitiveFlag && unicode.IsLetter(rn.runeArray[i]) && unicode.IsLetter(runes[thisNodeRuneIdx+i]) { 570 if rn.runeArray[i] == runes[thisNodeRuneIdx+i]+32 { 571 matches += 1 572 matchArr = append(matchArr, runes[thisNodeRuneIdx+i]) 573 } else if rn.runeArray[i] == runes[thisNodeRuneIdx+i]-32 { 574 matches += 1 575 matchArr = append(matchArr, runes[thisNodeRuneIdx+i]) 576 } else { 577 break 578 } 579 } else { 580 break 581 582 } 583 } 584 } 585 } 586 587 // If we are expecting a non-zero match, set the min group size 588 // to the length of the rune array (literal length) 589 if matchVars.minGroupSize > 0 { 590 matchVars.minGroupSize = len(rn.runeArray) 591 } 592 593 // Matches must be at least min group but can exceed max, will be cut off. 594 if matchVars.minGroupSize == -1 || matches >= matchVars.minGroupSize { 595 if matchVars.maxGroupSize == -1 || matches <= matchVars.maxGroupSize { 596 // Matched 597 matchVars.parentMatched = true 598 if matches != 0 { 599 matchVars.candidateRunes = append( 600 matchVars.candidateRunes, matchArr..., 601 ) 602 thisNodeRuneIdx += matches 603 } 604 } else if matches > matchVars.maxGroupSize { 605 // Matched, but exceeded max 606 // set matches to max 607 matches = matchVars.maxGroupSize 608 if len(matchArr) > matches { 609 matchArr = matchArr[:matches] 610 } 611 matchVars.candidateRunes = append( 612 matchVars.candidateRunes, matchArr..., 613 ) 614 thisNodeRuneIdx += matches 615 matchVars.parentMatched = true 616 } else { 617 // Not matched 618 // If the parent is a concat, this is an AND statement, we should skip sibings 619 hasConcatParent := false 620 for _, path := range rn.pathStrings { 621 if path == "Concat" { 622 hasConcatParent = true 623 break 624 } 625 } 626 627 // Calculate skip length here 628 if hasConcatParent { 629 matchVars.skipUntilNum = calcSkipLength( 630 matchVars.pathMap, matchVars.currentNodeIdx, true, 631 ) 632 matchVars.candidateRunes = matchVars.candidateRunes[:0] 633 // pop one idx 634 matchVars.subjectRuneCandidateIndices = matchVars.subjectRuneCandidateIndices[:len(matchVars.subjectRuneCandidateIndices)-1] 635 } else { 636 matchVars.skipUntilNum = calcSkipLength( 637 matchVars.pathMap, matchVars.currentNodeIdx, false, 638 ) 639 // Reset one idx 640 matchVars.subjectRuneCandidateIndices[len(matchVars.subjectRuneCandidateIndices)-1] = thisNodeRuneParentIdx 641 thisNodeRuneIdx = thisNodeRuneParentIdx 642 } 643 } 644 } else { 645 // Not matched 646 // If the parent is a concat, this is an AND statement, we should skip sibings 647 hasConcatParent := false 648 parentPtr := rn.parent 649 for { 650 if parentPtr == rn { 651 break 652 } 653 if parentPtr.thisOp == "Concat" { 654 hasConcatParent = true 655 break 656 } else if parentPtr.thisOp == "Alternate" { 657 break 658 } else { 659 parentPtr = parentPtr.parent 660 } 661 662 } 663 // If not matched, we don't care about evaluating the 664 // children of the current node (and potentially siblings) 665 if hasConcatParent { 666 matchVars.skipUntilNum = calcSkipLength( 667 matchVars.pathMap, matchVars.currentNodeIdx, true, 668 ) 669 matchVars.candidateRunes = matchVars.candidateRunes[:0] 670 // pop one idx 671 matchVars.subjectRuneCandidateIndices = matchVars.subjectRuneCandidateIndices[:len(matchVars.subjectRuneCandidateIndices)-1] 672 } else { 673 matchVars.skipUntilNum = calcSkipLength( 674 matchVars.pathMap, matchVars.currentNodeIdx, false, 675 ) 676 // Reset one idx 677 matchVars.subjectRuneCandidateIndices[len(matchVars.subjectRuneCandidateIndices)-1] = thisNodeRuneParentIdx 678 thisNodeRuneIdx = thisNodeRuneParentIdx 679 } 680 } 681 case "CharClass": 682 // Evaluate the char class 683 // We generate and use a LUT for the char class as an optimization over directly 684 // checking the ranges. 685 var lut *RangeLUT 686 if rn.rangeLUT == nil { 687 rangesArray := ArrayAsRanges(rn.runeArray) 688 rn.rangeLUT = newRangeLUT(rangesArray) 689 } else { 690 lut = rn.rangeLUT 691 } 692 693 matches := 0 694 for { 695 if thisNodeRuneIdx+matches < len(runes) { 696 if containsCharInRange(runes[thisNodeRuneIdx+matches], lut) { 697 matches += 1 698 } else { 699 break 700 } 701 } else { 702 break 703 } 704 } 705 706 // Must be at least min group but can exceed max, will be cut off. 707 if matchVars.minGroupSize == -1 || matches >= matchVars.minGroupSize { 708 if matchVars.maxGroupSize == -1 || matches <= matchVars.maxGroupSize { 709 // Matched 710 matchVars.parentMatched = true 711 if matches != 0 { 712 matchVars.candidateRunes = append( 713 matchVars.candidateRunes, 714 runes[thisNodeRuneIdx:thisNodeRuneIdx+matches]..., 715 ) 716 thisNodeRuneIdx += matches 717 } 718 } else if matches > matchVars.maxGroupSize { 719 // Matched, but exceeded max 720 // set matches to max 721 matches = matchVars.maxGroupSize 722 matchVars.candidateRunes = append( 723 matchVars.candidateRunes, 724 runes[thisNodeRuneIdx:thisNodeRuneIdx+matches]..., 725 ) 726 thisNodeRuneIdx += matches 727 matchVars.parentMatched = true 728 } else { 729 // Not matched 730 // If the last alt/concat parent was a concat 731 hasConcatParent := false 732 parentPtr := rn.parent 733 for { 734 if parentPtr == rn { 735 break 736 } 737 if parentPtr.thisOp == "Concat" { 738 hasConcatParent = true 739 break 740 } else if parentPtr.thisOp == "Alternate" { 741 break 742 } else { 743 parentPtr = parentPtr.parent 744 } 745 746 } 747 748 // If not matched, we don't care about evaluating the 749 // children of the current node (and potentially siblings) 750 if hasConcatParent { 751 matchVars.skipUntilNum = calcSkipLength( 752 matchVars.pathMap, matchVars.currentNodeIdx, true, 753 ) 754 matchVars.candidateRunes = matchVars.candidateRunes[:0] 755 // pop one idx 756 matchVars.subjectRuneCandidateIndices = matchVars.subjectRuneCandidateIndices[:len(matchVars.subjectRuneCandidateIndices)-1] 757 } else { 758 matchVars.skipUntilNum = calcSkipLength( 759 matchVars.pathMap, matchVars.currentNodeIdx, false, 760 ) 761 // Reset one idx 762 matchVars.subjectRuneCandidateIndices[len(matchVars.subjectRuneCandidateIndices)-1] = thisNodeRuneParentIdx 763 thisNodeRuneIdx = thisNodeRuneParentIdx 764 } 765 } 766 } else { 767 // Not matched 768 // If the parent is a concat, this is an AND statement, we should skip sibings 769 hasConcatParent := false 770 parentPtr := rn.parent 771 for { 772 if parentPtr == rn { 773 break 774 } 775 if parentPtr.thisOp == "Concat" { 776 hasConcatParent = true 777 break 778 } else if parentPtr.thisOp == "Alternate" { 779 break 780 } else { 781 parentPtr = parentPtr.parent 782 } 783 784 } 785 786 // Calculate skip length here 787 if hasConcatParent { 788 matchVars.skipUntilNum = calcSkipLength( 789 matchVars.pathMap, matchVars.currentNodeIdx, true, 790 ) 791 matchVars.candidateRunes = matchVars.candidateRunes[:0] 792 // pop one idx 793 matchVars.subjectRuneCandidateIndices = matchVars.subjectRuneCandidateIndices[:len(matchVars.subjectRuneCandidateIndices)-1] 794 } else { 795 //fmt.Printf("Parent is not concat, skipping children\n") 796 matchVars.skipUntilNum = calcSkipLength( 797 matchVars.pathMap, matchVars.currentNodeIdx, false, 798 ) 799 // Reset one idx 800 matchVars.subjectRuneCandidateIndices[len(matchVars.subjectRuneCandidateIndices)-1] = thisNodeRuneParentIdx 801 thisNodeRuneIdx = thisNodeRuneParentIdx 802 } 803 } 804 805 default: 806 // Do nothing if we don't find the operation 807 808 } 809 } else { 810 // Decrement the skip until num 811 matchVars.skipUntilNum -= 1 812 } 813 814 // Reset min/max if there is no path to a min/max setting node 815 found := false 816 if level > matchVars.lastInfoOpLevel { 817 matchVars.lastInfoOpLevel = level 818 } 819 820 if matchVars.minGroupSize == 1 && matchVars.maxGroupSize == -1 { 821 found = true 822 } else if matchVars.lastInfoOpLevel != 1 { 823 found = true 824 } 825 826 if !found { 827 matchVars.minGroupSize = 1 828 matchVars.maxGroupSize = -1 829 } 830 831 // Update the rune candidate idx. If theres not a Alternate,we update the parent 832 if thisNodeRuneIdx != -1 { 833 parentOp := rn.parent.thisOp 834 if parentOp == "Quest" || parentOp == "Plus" || parentOp == "Repeat" || parentOp == "Star" { 835 if len(matchVars.subjectRuneCandidateIndices) > 1 { 836 matchVars.subjectRuneCandidateIndices[len(matchVars.subjectRuneCandidateIndices)-2] = thisNodeRuneIdx 837 } 838 } 839 matchVars.subjectRuneCandidateIndices[len(matchVars.subjectRuneCandidateIndices)-1] = thisNodeRuneIdx 840 } 841 842 // Load info from the current node 843 matchVars.currentNodeIdx += 1 844 // If next node is a branch root, and this node is a failed match, we want to actively clear the candidate runes 845 flagNextNodeIsBranchRoot := false 846 if matchVars.currentNodeIdx < len(matchVars.pathMap) && len(matchVars.pathMap[matchVars.currentNodeIdx]) == 2 { 847 flagNextNodeIsBranchRoot = true 848 } 849 if flagNextNodeIsBranchRoot && len(matchVars.candidateRunes) != 0 && !matchVars.parentMatched { 850 matchVars.candidateRunes = matchVars.candidateRunes[:0] 851 } 852 // Traverse the children 853 for _, child := range rn.children { 854 child.traverseRegexTree(runes, matchVars, level) 855 } 856 857 } 858 859 // Given current index, find the next index that isn't a child of the current index 860 // If skipSiblings is true, we skip all siblings of the current node as well 861 // Return the number of nodes between the current node and the next node that isn't a child of the current node 862 func calcSkipLength(mapOfTree [][]int, currentPos int, skipSiblings bool) int { 863 // Get the current path 864 currentPath := mapOfTree[currentPos] 865 lenOfCurrentPath := len(currentPath) 866 skipLength := 0 867 for { 868 // Check if we are at end of map 869 if currentPos == len(mapOfTree)-1 { 870 break 871 } 872 // Check if we are at root 873 if len(mapOfTree[currentPos]) == 1 { 874 break 875 } 876 877 // Siblings are on the same length, if we want to skip siblings, we only check for lesser length 878 if skipSiblings { 879 if len(mapOfTree[currentPos+1]) < lenOfCurrentPath { 880 break 881 } else { 882 currentPos += 1 883 } 884 } else { 885 if len(mapOfTree[currentPos+1]) <= lenOfCurrentPath { 886 break 887 } else { 888 currentPos += 1 889 } 890 } 891 892 skipLength += 1 893 } 894 return skipLength 895 }