golang.org/x/text@v0.14.0/unicode/bidi/core.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package bidi 6 7 import ( 8 "fmt" 9 "log" 10 ) 11 12 // This implementation is a port based on the reference implementation found at: 13 // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/ 14 // 15 // described in Unicode Bidirectional Algorithm (UAX #9). 16 // 17 // Input: 18 // There are two levels of input to the algorithm, since clients may prefer to 19 // supply some information from out-of-band sources rather than relying on the 20 // default behavior. 21 // 22 // - Bidi class array 23 // - Bidi class array, with externally supplied base line direction 24 // 25 // Output: 26 // Output is separated into several stages: 27 // 28 // - levels array over entire paragraph 29 // - reordering array over entire paragraph 30 // - levels array over line 31 // - reordering array over line 32 // 33 // Note that for conformance to the Unicode Bidirectional Algorithm, 34 // implementations are only required to generate correct reordering and 35 // character directionality (odd or even levels) over a line. Generating 36 // identical level arrays over a line is not required. Bidi explicit format 37 // codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and 38 // positions as long as the rest of the input is properly reordered. 39 // 40 // As the algorithm is defined to operate on a single paragraph at a time, this 41 // implementation is written to handle single paragraphs. Thus rule P1 is 42 // presumed by this implementation-- the data provided to the implementation is 43 // assumed to be a single paragraph, and either contains no 'B' codes, or a 44 // single 'B' code at the end of the input. 'B' is allowed as input to 45 // illustrate how the algorithm assigns it a level. 46 // 47 // Also note that rules L3 and L4 depend on the rendering engine that uses the 48 // result of the bidi algorithm. This implementation assumes that the rendering 49 // engine expects combining marks in visual order (e.g. to the left of their 50 // base character in RTL runs) and that it adjusts the glyphs used to render 51 // mirrored characters that are in RTL runs so that they render appropriately. 52 53 // level is the embedding level of a character. Even embedding levels indicate 54 // left-to-right order and odd levels indicate right-to-left order. The special 55 // level of -1 is reserved for undefined order. 56 type level int8 57 58 const implicitLevel level = -1 59 60 // in returns if x is equal to any of the values in set. 61 func (c Class) in(set ...Class) bool { 62 for _, s := range set { 63 if c == s { 64 return true 65 } 66 } 67 return false 68 } 69 70 // A paragraph contains the state of a paragraph. 71 type paragraph struct { 72 initialTypes []Class 73 74 // Arrays of properties needed for paired bracket evaluation in N0 75 pairTypes []bracketType // paired Bracket types for paragraph 76 pairValues []rune // rune for opening bracket or pbOpen and pbClose; 0 for pbNone 77 78 embeddingLevel level // default: = implicitLevel; 79 80 // at the paragraph levels 81 resultTypes []Class 82 resultLevels []level 83 84 // Index of matching PDI for isolate initiator characters. For other 85 // characters, the value of matchingPDI will be set to -1. For isolate 86 // initiators with no matching PDI, matchingPDI will be set to the length of 87 // the input string. 88 matchingPDI []int 89 90 // Index of matching isolate initiator for PDI characters. For other 91 // characters, and for PDIs with no matching isolate initiator, the value of 92 // matchingIsolateInitiator will be set to -1. 93 matchingIsolateInitiator []int 94 } 95 96 // newParagraph initializes a paragraph. The user needs to supply a few arrays 97 // corresponding to the preprocessed text input. The types correspond to the 98 // Unicode BiDi classes for each rune. pairTypes indicates the bracket type for 99 // each rune. pairValues provides a unique bracket class identifier for each 100 // rune (suggested is the rune of the open bracket for opening and matching 101 // close brackets, after normalization). The embedding levels are optional, but 102 // may be supplied to encode embedding levels of styled text. 103 func newParagraph(types []Class, pairTypes []bracketType, pairValues []rune, levels level) (*paragraph, error) { 104 var err error 105 if err = validateTypes(types); err != nil { 106 return nil, err 107 } 108 if err = validatePbTypes(pairTypes); err != nil { 109 return nil, err 110 } 111 if err = validatePbValues(pairValues, pairTypes); err != nil { 112 return nil, err 113 } 114 if err = validateParagraphEmbeddingLevel(levels); err != nil { 115 return nil, err 116 } 117 118 p := ¶graph{ 119 initialTypes: append([]Class(nil), types...), 120 embeddingLevel: levels, 121 122 pairTypes: pairTypes, 123 pairValues: pairValues, 124 125 resultTypes: append([]Class(nil), types...), 126 } 127 p.run() 128 return p, nil 129 } 130 131 func (p *paragraph) Len() int { return len(p.initialTypes) } 132 133 // The algorithm. Does not include line-based processing (Rules L1, L2). 134 // These are applied later in the line-based phase of the algorithm. 135 func (p *paragraph) run() { 136 p.determineMatchingIsolates() 137 138 // 1) determining the paragraph level 139 // Rule P1 is the requirement for entering this algorithm. 140 // Rules P2, P3. 141 // If no externally supplied paragraph embedding level, use default. 142 if p.embeddingLevel == implicitLevel { 143 p.embeddingLevel = p.determineParagraphEmbeddingLevel(0, p.Len()) 144 } 145 146 // Initialize result levels to paragraph embedding level. 147 p.resultLevels = make([]level, p.Len()) 148 setLevels(p.resultLevels, p.embeddingLevel) 149 150 // 2) Explicit levels and directions 151 // Rules X1-X8. 152 p.determineExplicitEmbeddingLevels() 153 154 // Rule X9. 155 // We do not remove the embeddings, the overrides, the PDFs, and the BNs 156 // from the string explicitly. But they are not copied into isolating run 157 // sequences when they are created, so they are removed for all 158 // practical purposes. 159 160 // Rule X10. 161 // Run remainder of algorithm one isolating run sequence at a time 162 for _, seq := range p.determineIsolatingRunSequences() { 163 // 3) resolving weak types 164 // Rules W1-W7. 165 seq.resolveWeakTypes() 166 167 // 4a) resolving paired brackets 168 // Rule N0 169 resolvePairedBrackets(seq) 170 171 // 4b) resolving neutral types 172 // Rules N1-N3. 173 seq.resolveNeutralTypes() 174 175 // 5) resolving implicit embedding levels 176 // Rules I1, I2. 177 seq.resolveImplicitLevels() 178 179 // Apply the computed levels and types 180 seq.applyLevelsAndTypes() 181 } 182 183 // Assign appropriate levels to 'hide' LREs, RLEs, LROs, RLOs, PDFs, and 184 // BNs. This is for convenience, so the resulting level array will have 185 // a value for every character. 186 p.assignLevelsToCharactersRemovedByX9() 187 } 188 189 // determineMatchingIsolates determines the matching PDI for each isolate 190 // initiator and vice versa. 191 // 192 // Definition BD9. 193 // 194 // At the end of this function: 195 // 196 // - The member variable matchingPDI is set to point to the index of the 197 // matching PDI character for each isolate initiator character. If there is 198 // no matching PDI, it is set to the length of the input text. For other 199 // characters, it is set to -1. 200 // - The member variable matchingIsolateInitiator is set to point to the 201 // index of the matching isolate initiator character for each PDI character. 202 // If there is no matching isolate initiator, or the character is not a PDI, 203 // it is set to -1. 204 func (p *paragraph) determineMatchingIsolates() { 205 p.matchingPDI = make([]int, p.Len()) 206 p.matchingIsolateInitiator = make([]int, p.Len()) 207 208 for i := range p.matchingIsolateInitiator { 209 p.matchingIsolateInitiator[i] = -1 210 } 211 212 for i := range p.matchingPDI { 213 p.matchingPDI[i] = -1 214 215 if t := p.resultTypes[i]; t.in(LRI, RLI, FSI) { 216 depthCounter := 1 217 for j := i + 1; j < p.Len(); j++ { 218 if u := p.resultTypes[j]; u.in(LRI, RLI, FSI) { 219 depthCounter++ 220 } else if u == PDI { 221 if depthCounter--; depthCounter == 0 { 222 p.matchingPDI[i] = j 223 p.matchingIsolateInitiator[j] = i 224 break 225 } 226 } 227 } 228 if p.matchingPDI[i] == -1 { 229 p.matchingPDI[i] = p.Len() 230 } 231 } 232 } 233 } 234 235 // determineParagraphEmbeddingLevel reports the resolved paragraph direction of 236 // the substring limited by the given range [start, end). 237 // 238 // Determines the paragraph level based on rules P2, P3. This is also used 239 // in rule X5c to find if an FSI should resolve to LRI or RLI. 240 func (p *paragraph) determineParagraphEmbeddingLevel(start, end int) level { 241 var strongType Class = unknownClass 242 243 // Rule P2. 244 for i := start; i < end; i++ { 245 if t := p.resultTypes[i]; t.in(L, AL, R) { 246 strongType = t 247 break 248 } else if t.in(FSI, LRI, RLI) { 249 i = p.matchingPDI[i] // skip over to the matching PDI 250 if i > end { 251 log.Panic("assert (i <= end)") 252 } 253 } 254 } 255 // Rule P3. 256 switch strongType { 257 case unknownClass: // none found 258 // default embedding level when no strong types found is 0. 259 return 0 260 case L: 261 return 0 262 default: // AL, R 263 return 1 264 } 265 } 266 267 const maxDepth = 125 268 269 // This stack will store the embedding levels and override and isolated 270 // statuses 271 type directionalStatusStack struct { 272 stackCounter int 273 embeddingLevelStack [maxDepth + 1]level 274 overrideStatusStack [maxDepth + 1]Class 275 isolateStatusStack [maxDepth + 1]bool 276 } 277 278 func (s *directionalStatusStack) empty() { s.stackCounter = 0 } 279 func (s *directionalStatusStack) pop() { s.stackCounter-- } 280 func (s *directionalStatusStack) depth() int { return s.stackCounter } 281 282 func (s *directionalStatusStack) push(level level, overrideStatus Class, isolateStatus bool) { 283 s.embeddingLevelStack[s.stackCounter] = level 284 s.overrideStatusStack[s.stackCounter] = overrideStatus 285 s.isolateStatusStack[s.stackCounter] = isolateStatus 286 s.stackCounter++ 287 } 288 289 func (s *directionalStatusStack) lastEmbeddingLevel() level { 290 return s.embeddingLevelStack[s.stackCounter-1] 291 } 292 293 func (s *directionalStatusStack) lastDirectionalOverrideStatus() Class { 294 return s.overrideStatusStack[s.stackCounter-1] 295 } 296 297 func (s *directionalStatusStack) lastDirectionalIsolateStatus() bool { 298 return s.isolateStatusStack[s.stackCounter-1] 299 } 300 301 // Determine explicit levels using rules X1 - X8 302 func (p *paragraph) determineExplicitEmbeddingLevels() { 303 var stack directionalStatusStack 304 var overflowIsolateCount, overflowEmbeddingCount, validIsolateCount int 305 306 // Rule X1. 307 stack.push(p.embeddingLevel, ON, false) 308 309 for i, t := range p.resultTypes { 310 // Rules X2, X3, X4, X5, X5a, X5b, X5c 311 switch t { 312 case RLE, LRE, RLO, LRO, RLI, LRI, FSI: 313 isIsolate := t.in(RLI, LRI, FSI) 314 isRTL := t.in(RLE, RLO, RLI) 315 316 // override if this is an FSI that resolves to RLI 317 if t == FSI { 318 isRTL = (p.determineParagraphEmbeddingLevel(i+1, p.matchingPDI[i]) == 1) 319 } 320 if isIsolate { 321 p.resultLevels[i] = stack.lastEmbeddingLevel() 322 if stack.lastDirectionalOverrideStatus() != ON { 323 p.resultTypes[i] = stack.lastDirectionalOverrideStatus() 324 } 325 } 326 327 var newLevel level 328 if isRTL { 329 // least greater odd 330 newLevel = (stack.lastEmbeddingLevel() + 1) | 1 331 } else { 332 // least greater even 333 newLevel = (stack.lastEmbeddingLevel() + 2) &^ 1 334 } 335 336 if newLevel <= maxDepth && overflowIsolateCount == 0 && overflowEmbeddingCount == 0 { 337 if isIsolate { 338 validIsolateCount++ 339 } 340 // Push new embedding level, override status, and isolated 341 // status. 342 // No check for valid stack counter, since the level check 343 // suffices. 344 switch t { 345 case LRO: 346 stack.push(newLevel, L, isIsolate) 347 case RLO: 348 stack.push(newLevel, R, isIsolate) 349 default: 350 stack.push(newLevel, ON, isIsolate) 351 } 352 // Not really part of the spec 353 if !isIsolate { 354 p.resultLevels[i] = newLevel 355 } 356 } else { 357 // This is an invalid explicit formatting character, 358 // so apply the "Otherwise" part of rules X2-X5b. 359 if isIsolate { 360 overflowIsolateCount++ 361 } else { // !isIsolate 362 if overflowIsolateCount == 0 { 363 overflowEmbeddingCount++ 364 } 365 } 366 } 367 368 // Rule X6a 369 case PDI: 370 if overflowIsolateCount > 0 { 371 overflowIsolateCount-- 372 } else if validIsolateCount == 0 { 373 // do nothing 374 } else { 375 overflowEmbeddingCount = 0 376 for !stack.lastDirectionalIsolateStatus() { 377 stack.pop() 378 } 379 stack.pop() 380 validIsolateCount-- 381 } 382 p.resultLevels[i] = stack.lastEmbeddingLevel() 383 384 // Rule X7 385 case PDF: 386 // Not really part of the spec 387 p.resultLevels[i] = stack.lastEmbeddingLevel() 388 389 if overflowIsolateCount > 0 { 390 // do nothing 391 } else if overflowEmbeddingCount > 0 { 392 overflowEmbeddingCount-- 393 } else if !stack.lastDirectionalIsolateStatus() && stack.depth() >= 2 { 394 stack.pop() 395 } 396 397 case B: // paragraph separator. 398 // Rule X8. 399 400 // These values are reset for clarity, in this implementation B 401 // can only occur as the last code in the array. 402 stack.empty() 403 overflowIsolateCount = 0 404 overflowEmbeddingCount = 0 405 validIsolateCount = 0 406 p.resultLevels[i] = p.embeddingLevel 407 408 default: 409 p.resultLevels[i] = stack.lastEmbeddingLevel() 410 if stack.lastDirectionalOverrideStatus() != ON { 411 p.resultTypes[i] = stack.lastDirectionalOverrideStatus() 412 } 413 } 414 } 415 } 416 417 type isolatingRunSequence struct { 418 p *paragraph 419 420 indexes []int // indexes to the original string 421 422 types []Class // type of each character using the index 423 resolvedLevels []level // resolved levels after application of rules 424 level level 425 sos, eos Class 426 } 427 428 func (i *isolatingRunSequence) Len() int { return len(i.indexes) } 429 430 func maxLevel(a, b level) level { 431 if a > b { 432 return a 433 } 434 return b 435 } 436 437 // Rule X10, second bullet: Determine the start-of-sequence (sos) and end-of-sequence (eos) types, 438 // either L or R, for each isolating run sequence. 439 func (p *paragraph) isolatingRunSequence(indexes []int) *isolatingRunSequence { 440 length := len(indexes) 441 types := make([]Class, length) 442 for i, x := range indexes { 443 types[i] = p.resultTypes[x] 444 } 445 446 // assign level, sos and eos 447 prevChar := indexes[0] - 1 448 for prevChar >= 0 && isRemovedByX9(p.initialTypes[prevChar]) { 449 prevChar-- 450 } 451 prevLevel := p.embeddingLevel 452 if prevChar >= 0 { 453 prevLevel = p.resultLevels[prevChar] 454 } 455 456 var succLevel level 457 lastType := types[length-1] 458 if lastType.in(LRI, RLI, FSI) { 459 succLevel = p.embeddingLevel 460 } else { 461 // the first character after the end of run sequence 462 limit := indexes[length-1] + 1 463 for ; limit < p.Len() && isRemovedByX9(p.initialTypes[limit]); limit++ { 464 465 } 466 succLevel = p.embeddingLevel 467 if limit < p.Len() { 468 succLevel = p.resultLevels[limit] 469 } 470 } 471 level := p.resultLevels[indexes[0]] 472 return &isolatingRunSequence{ 473 p: p, 474 indexes: indexes, 475 types: types, 476 level: level, 477 sos: typeForLevel(maxLevel(prevLevel, level)), 478 eos: typeForLevel(maxLevel(succLevel, level)), 479 } 480 } 481 482 // Resolving weak types Rules W1-W7. 483 // 484 // Note that some weak types (EN, AN) remain after this processing is 485 // complete. 486 func (s *isolatingRunSequence) resolveWeakTypes() { 487 488 // on entry, only these types remain 489 s.assertOnly(L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI) 490 491 // Rule W1. 492 // Changes all NSMs. 493 precedingCharacterType := s.sos 494 for i, t := range s.types { 495 if t == NSM { 496 s.types[i] = precedingCharacterType 497 } else { 498 // if t.in(LRI, RLI, FSI, PDI) { 499 // precedingCharacterType = ON 500 // } 501 precedingCharacterType = t 502 } 503 } 504 505 // Rule W2. 506 // EN does not change at the start of the run, because sos != AL. 507 for i, t := range s.types { 508 if t == EN { 509 for j := i - 1; j >= 0; j-- { 510 if t := s.types[j]; t.in(L, R, AL) { 511 if t == AL { 512 s.types[i] = AN 513 } 514 break 515 } 516 } 517 } 518 } 519 520 // Rule W3. 521 for i, t := range s.types { 522 if t == AL { 523 s.types[i] = R 524 } 525 } 526 527 // Rule W4. 528 // Since there must be values on both sides for this rule to have an 529 // effect, the scan skips the first and last value. 530 // 531 // Although the scan proceeds left to right, and changes the type 532 // values in a way that would appear to affect the computations 533 // later in the scan, there is actually no problem. A change in the 534 // current value can only affect the value to its immediate right, 535 // and only affect it if it is ES or CS. But the current value can 536 // only change if the value to its right is not ES or CS. Thus 537 // either the current value will not change, or its change will have 538 // no effect on the remainder of the analysis. 539 540 for i := 1; i < s.Len()-1; i++ { 541 t := s.types[i] 542 if t == ES || t == CS { 543 prevSepType := s.types[i-1] 544 succSepType := s.types[i+1] 545 if prevSepType == EN && succSepType == EN { 546 s.types[i] = EN 547 } else if s.types[i] == CS && prevSepType == AN && succSepType == AN { 548 s.types[i] = AN 549 } 550 } 551 } 552 553 // Rule W5. 554 for i, t := range s.types { 555 if t == ET { 556 // locate end of sequence 557 runStart := i 558 runEnd := s.findRunLimit(runStart, ET) 559 560 // check values at ends of sequence 561 t := s.sos 562 if runStart > 0 { 563 t = s.types[runStart-1] 564 } 565 if t != EN { 566 t = s.eos 567 if runEnd < len(s.types) { 568 t = s.types[runEnd] 569 } 570 } 571 if t == EN { 572 setTypes(s.types[runStart:runEnd], EN) 573 } 574 // continue at end of sequence 575 i = runEnd 576 } 577 } 578 579 // Rule W6. 580 for i, t := range s.types { 581 if t.in(ES, ET, CS) { 582 s.types[i] = ON 583 } 584 } 585 586 // Rule W7. 587 for i, t := range s.types { 588 if t == EN { 589 // set default if we reach start of run 590 prevStrongType := s.sos 591 for j := i - 1; j >= 0; j-- { 592 t = s.types[j] 593 if t == L || t == R { // AL's have been changed to R 594 prevStrongType = t 595 break 596 } 597 } 598 if prevStrongType == L { 599 s.types[i] = L 600 } 601 } 602 } 603 } 604 605 // 6) resolving neutral types Rules N1-N2. 606 func (s *isolatingRunSequence) resolveNeutralTypes() { 607 608 // on entry, only these types can be in resultTypes 609 s.assertOnly(L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI) 610 611 for i, t := range s.types { 612 switch t { 613 case WS, ON, B, S, RLI, LRI, FSI, PDI: 614 // find bounds of run of neutrals 615 runStart := i 616 runEnd := s.findRunLimit(runStart, B, S, WS, ON, RLI, LRI, FSI, PDI) 617 618 // determine effective types at ends of run 619 var leadType, trailType Class 620 621 // Note that the character found can only be L, R, AN, or 622 // EN. 623 if runStart == 0 { 624 leadType = s.sos 625 } else { 626 leadType = s.types[runStart-1] 627 if leadType.in(AN, EN) { 628 leadType = R 629 } 630 } 631 if runEnd == len(s.types) { 632 trailType = s.eos 633 } else { 634 trailType = s.types[runEnd] 635 if trailType.in(AN, EN) { 636 trailType = R 637 } 638 } 639 640 var resolvedType Class 641 if leadType == trailType { 642 // Rule N1. 643 resolvedType = leadType 644 } else { 645 // Rule N2. 646 // Notice the embedding level of the run is used, not 647 // the paragraph embedding level. 648 resolvedType = typeForLevel(s.level) 649 } 650 651 setTypes(s.types[runStart:runEnd], resolvedType) 652 653 // skip over run of (former) neutrals 654 i = runEnd 655 } 656 } 657 } 658 659 func setLevels(levels []level, newLevel level) { 660 for i := range levels { 661 levels[i] = newLevel 662 } 663 } 664 665 func setTypes(types []Class, newType Class) { 666 for i := range types { 667 types[i] = newType 668 } 669 } 670 671 // 7) resolving implicit embedding levels Rules I1, I2. 672 func (s *isolatingRunSequence) resolveImplicitLevels() { 673 674 // on entry, only these types can be in resultTypes 675 s.assertOnly(L, R, EN, AN) 676 677 s.resolvedLevels = make([]level, len(s.types)) 678 setLevels(s.resolvedLevels, s.level) 679 680 if (s.level & 1) == 0 { // even level 681 for i, t := range s.types { 682 // Rule I1. 683 if t == L { 684 // no change 685 } else if t == R { 686 s.resolvedLevels[i] += 1 687 } else { // t == AN || t == EN 688 s.resolvedLevels[i] += 2 689 } 690 } 691 } else { // odd level 692 for i, t := range s.types { 693 // Rule I2. 694 if t == R { 695 // no change 696 } else { // t == L || t == AN || t == EN 697 s.resolvedLevels[i] += 1 698 } 699 } 700 } 701 } 702 703 // Applies the levels and types resolved in rules W1-I2 to the 704 // resultLevels array. 705 func (s *isolatingRunSequence) applyLevelsAndTypes() { 706 for i, x := range s.indexes { 707 s.p.resultTypes[x] = s.types[i] 708 s.p.resultLevels[x] = s.resolvedLevels[i] 709 } 710 } 711 712 // Return the limit of the run consisting only of the types in validSet 713 // starting at index. This checks the value at index, and will return 714 // index if that value is not in validSet. 715 func (s *isolatingRunSequence) findRunLimit(index int, validSet ...Class) int { 716 loop: 717 for ; index < len(s.types); index++ { 718 t := s.types[index] 719 for _, valid := range validSet { 720 if t == valid { 721 continue loop 722 } 723 } 724 return index // didn't find a match in validSet 725 } 726 return len(s.types) 727 } 728 729 // Algorithm validation. Assert that all values in types are in the 730 // provided set. 731 func (s *isolatingRunSequence) assertOnly(codes ...Class) { 732 loop: 733 for i, t := range s.types { 734 for _, c := range codes { 735 if t == c { 736 continue loop 737 } 738 } 739 log.Panicf("invalid bidi code %v present in assertOnly at position %d", t, s.indexes[i]) 740 } 741 } 742 743 // determineLevelRuns returns an array of level runs. Each level run is 744 // described as an array of indexes into the input string. 745 // 746 // Determines the level runs. Rule X9 will be applied in determining the 747 // runs, in the way that makes sure the characters that are supposed to be 748 // removed are not included in the runs. 749 func (p *paragraph) determineLevelRuns() [][]int { 750 run := []int{} 751 allRuns := [][]int{} 752 currentLevel := implicitLevel 753 754 for i := range p.initialTypes { 755 if !isRemovedByX9(p.initialTypes[i]) { 756 if p.resultLevels[i] != currentLevel { 757 // we just encountered a new run; wrap up last run 758 if currentLevel >= 0 { // only wrap it up if there was a run 759 allRuns = append(allRuns, run) 760 run = nil 761 } 762 // Start new run 763 currentLevel = p.resultLevels[i] 764 } 765 run = append(run, i) 766 } 767 } 768 // Wrap up the final run, if any 769 if len(run) > 0 { 770 allRuns = append(allRuns, run) 771 } 772 return allRuns 773 } 774 775 // Definition BD13. Determine isolating run sequences. 776 func (p *paragraph) determineIsolatingRunSequences() []*isolatingRunSequence { 777 levelRuns := p.determineLevelRuns() 778 779 // Compute the run that each character belongs to 780 runForCharacter := make([]int, p.Len()) 781 for i, run := range levelRuns { 782 for _, index := range run { 783 runForCharacter[index] = i 784 } 785 } 786 787 sequences := []*isolatingRunSequence{} 788 789 var currentRunSequence []int 790 791 for _, run := range levelRuns { 792 first := run[0] 793 if p.initialTypes[first] != PDI || p.matchingIsolateInitiator[first] == -1 { 794 currentRunSequence = nil 795 // int run = i; 796 for { 797 // Copy this level run into currentRunSequence 798 currentRunSequence = append(currentRunSequence, run...) 799 800 last := currentRunSequence[len(currentRunSequence)-1] 801 lastT := p.initialTypes[last] 802 if lastT.in(LRI, RLI, FSI) && p.matchingPDI[last] != p.Len() { 803 run = levelRuns[runForCharacter[p.matchingPDI[last]]] 804 } else { 805 break 806 } 807 } 808 sequences = append(sequences, p.isolatingRunSequence(currentRunSequence)) 809 } 810 } 811 return sequences 812 } 813 814 // Assign level information to characters removed by rule X9. This is for 815 // ease of relating the level information to the original input data. Note 816 // that the levels assigned to these codes are arbitrary, they're chosen so 817 // as to avoid breaking level runs. 818 func (p *paragraph) assignLevelsToCharactersRemovedByX9() { 819 for i, t := range p.initialTypes { 820 if t.in(LRE, RLE, LRO, RLO, PDF, BN) { 821 p.resultTypes[i] = t 822 p.resultLevels[i] = -1 823 } 824 } 825 // now propagate forward the levels information (could have 826 // propagated backward, the main thing is not to introduce a level 827 // break where one doesn't already exist). 828 829 if p.resultLevels[0] == -1 { 830 p.resultLevels[0] = p.embeddingLevel 831 } 832 for i := 1; i < len(p.initialTypes); i++ { 833 if p.resultLevels[i] == -1 { 834 p.resultLevels[i] = p.resultLevels[i-1] 835 } 836 } 837 // Embedding information is for informational purposes only so need not be 838 // adjusted. 839 } 840 841 // 842 // Output 843 // 844 845 // getLevels computes levels array breaking lines at offsets in linebreaks. 846 // Rule L1. 847 // 848 // The linebreaks array must include at least one value. The values must be 849 // in strictly increasing order (no duplicates) between 1 and the length of 850 // the text, inclusive. The last value must be the length of the text. 851 func (p *paragraph) getLevels(linebreaks []int) []level { 852 // Note that since the previous processing has removed all 853 // P, S, and WS values from resultTypes, the values referred to 854 // in these rules are the initial types, before any processing 855 // has been applied (including processing of overrides). 856 // 857 // This example implementation has reinserted explicit format codes 858 // and BN, in order that the levels array correspond to the 859 // initial text. Their final placement is not normative. 860 // These codes are treated like WS in this implementation, 861 // so they don't interrupt sequences of WS. 862 863 validateLineBreaks(linebreaks, p.Len()) 864 865 result := append([]level(nil), p.resultLevels...) 866 867 // don't worry about linebreaks since if there is a break within 868 // a series of WS values preceding S, the linebreak itself 869 // causes the reset. 870 for i, t := range p.initialTypes { 871 if t.in(B, S) { 872 // Rule L1, clauses one and two. 873 result[i] = p.embeddingLevel 874 875 // Rule L1, clause three. 876 for j := i - 1; j >= 0; j-- { 877 if isWhitespace(p.initialTypes[j]) { // including format codes 878 result[j] = p.embeddingLevel 879 } else { 880 break 881 } 882 } 883 } 884 } 885 886 // Rule L1, clause four. 887 start := 0 888 for _, limit := range linebreaks { 889 for j := limit - 1; j >= start; j-- { 890 if isWhitespace(p.initialTypes[j]) { // including format codes 891 result[j] = p.embeddingLevel 892 } else { 893 break 894 } 895 } 896 start = limit 897 } 898 899 return result 900 } 901 902 // getReordering returns the reordering of lines from a visual index to a 903 // logical index for line breaks at the given offsets. 904 // 905 // Lines are concatenated from left to right. So for example, the fifth 906 // character from the left on the third line is 907 // 908 // getReordering(linebreaks)[linebreaks[1] + 4] 909 // 910 // (linebreaks[1] is the position after the last character of the second 911 // line, which is also the index of the first character on the third line, 912 // and adding four gets the fifth character from the left). 913 // 914 // The linebreaks array must include at least one value. The values must be 915 // in strictly increasing order (no duplicates) between 1 and the length of 916 // the text, inclusive. The last value must be the length of the text. 917 func (p *paragraph) getReordering(linebreaks []int) []int { 918 validateLineBreaks(linebreaks, p.Len()) 919 920 return computeMultilineReordering(p.getLevels(linebreaks), linebreaks) 921 } 922 923 // Return multiline reordering array for a given level array. Reordering 924 // does not occur across a line break. 925 func computeMultilineReordering(levels []level, linebreaks []int) []int { 926 result := make([]int, len(levels)) 927 928 start := 0 929 for _, limit := range linebreaks { 930 tempLevels := make([]level, limit-start) 931 copy(tempLevels, levels[start:]) 932 933 for j, order := range computeReordering(tempLevels) { 934 result[start+j] = order + start 935 } 936 start = limit 937 } 938 return result 939 } 940 941 // Return reordering array for a given level array. This reorders a single 942 // line. The reordering is a visual to logical map. For example, the 943 // leftmost char is string.charAt(order[0]). Rule L2. 944 func computeReordering(levels []level) []int { 945 result := make([]int, len(levels)) 946 // initialize order 947 for i := range result { 948 result[i] = i 949 } 950 951 // locate highest level found on line. 952 // Note the rules say text, but no reordering across line bounds is 953 // performed, so this is sufficient. 954 highestLevel := level(0) 955 lowestOddLevel := level(maxDepth + 2) 956 for _, level := range levels { 957 if level > highestLevel { 958 highestLevel = level 959 } 960 if level&1 != 0 && level < lowestOddLevel { 961 lowestOddLevel = level 962 } 963 } 964 965 for level := highestLevel; level >= lowestOddLevel; level-- { 966 for i := 0; i < len(levels); i++ { 967 if levels[i] >= level { 968 // find range of text at or above this level 969 start := i 970 limit := i + 1 971 for limit < len(levels) && levels[limit] >= level { 972 limit++ 973 } 974 975 for j, k := start, limit-1; j < k; j, k = j+1, k-1 { 976 result[j], result[k] = result[k], result[j] 977 } 978 // skip to end of level run 979 i = limit 980 } 981 } 982 } 983 984 return result 985 } 986 987 // isWhitespace reports whether the type is considered a whitespace type for the 988 // line break rules. 989 func isWhitespace(c Class) bool { 990 switch c { 991 case LRE, RLE, LRO, RLO, PDF, LRI, RLI, FSI, PDI, BN, WS: 992 return true 993 } 994 return false 995 } 996 997 // isRemovedByX9 reports whether the type is one of the types removed in X9. 998 func isRemovedByX9(c Class) bool { 999 switch c { 1000 case LRE, RLE, LRO, RLO, PDF, BN: 1001 return true 1002 } 1003 return false 1004 } 1005 1006 // typeForLevel reports the strong type (L or R) corresponding to the level. 1007 func typeForLevel(level level) Class { 1008 if (level & 0x1) == 0 { 1009 return L 1010 } 1011 return R 1012 } 1013 1014 func validateTypes(types []Class) error { 1015 if len(types) == 0 { 1016 return fmt.Errorf("types is null") 1017 } 1018 for i, t := range types[:len(types)-1] { 1019 if t == B { 1020 return fmt.Errorf("B type before end of paragraph at index: %d", i) 1021 } 1022 } 1023 return nil 1024 } 1025 1026 func validateParagraphEmbeddingLevel(embeddingLevel level) error { 1027 if embeddingLevel != implicitLevel && 1028 embeddingLevel != 0 && 1029 embeddingLevel != 1 { 1030 return fmt.Errorf("illegal paragraph embedding level: %d", embeddingLevel) 1031 } 1032 return nil 1033 } 1034 1035 func validateLineBreaks(linebreaks []int, textLength int) error { 1036 prev := 0 1037 for i, next := range linebreaks { 1038 if next <= prev { 1039 return fmt.Errorf("bad linebreak: %d at index: %d", next, i) 1040 } 1041 prev = next 1042 } 1043 if prev != textLength { 1044 return fmt.Errorf("last linebreak was %d, want %d", prev, textLength) 1045 } 1046 return nil 1047 } 1048 1049 func validatePbTypes(pairTypes []bracketType) error { 1050 if len(pairTypes) == 0 { 1051 return fmt.Errorf("pairTypes is null") 1052 } 1053 for i, pt := range pairTypes { 1054 switch pt { 1055 case bpNone, bpOpen, bpClose: 1056 default: 1057 return fmt.Errorf("illegal pairType value at %d: %v", i, pairTypes[i]) 1058 } 1059 } 1060 return nil 1061 } 1062 1063 func validatePbValues(pairValues []rune, pairTypes []bracketType) error { 1064 if pairValues == nil { 1065 return fmt.Errorf("pairValues is null") 1066 } 1067 if len(pairTypes) != len(pairValues) { 1068 return fmt.Errorf("pairTypes is different length from pairValues") 1069 } 1070 return nil 1071 }