github.com/liquid-dev/text@v0.3.3-liquid/collate/build/colelem.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package build
     6  
     7  import (
     8  	"fmt"
     9  	"unicode"
    10  
    11  	"github.com/liquid-dev/text/internal/colltab"
    12  )
    13  
// Default weights used in this file: nextWeight resets the secondary and
// tertiary weights to defaultSecondary and defaultTertiary when it increments
// a weight at a higher level. maxTertiary is the tertiary weight that the
// decomposition scheme described at makeDecompose assigns to runes after the
// second one.
const (
	defaultSecondary = 0x20
	defaultTertiary  = 0x2
	maxTertiary      = 0x1F
)
    19  
    20  type rawCE struct {
    21  	w   []int
    22  	ccc uint8
    23  }
    24  
    25  func makeRawCE(w []int, ccc uint8) rawCE {
    26  	ce := rawCE{w: make([]int, 4), ccc: ccc}
    27  	copy(ce.w, w)
    28  	return ce
    29  }
    30  
    31  // A collation element is represented as an uint32.
    32  // In the typical case, a rune maps to a single collation element. If a rune
    33  // can be the start of a contraction or expands into multiple collation elements,
    34  // then the collation element that is associated with a rune will have a special
    35  // form to represent such m to n mappings.  Such special collation elements
    36  // have a value >= 0x80000000.
    37  
// Bit widths for the primary, secondary and tertiary weights.
// NOTE(review): nothing in the visible part of this file references these
// constants; presumably they mirror the limits enforced when packing an
// element (see colltab.MakeElem) — confirm before relying on them.
const (
	maxPrimaryBits   = 21
	maxSecondaryBits = 12
	maxTertiaryBits  = 8
)
    43  
    44  func makeCE(ce rawCE) (uint32, error) {
    45  	v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
    46  	return uint32(v), e
    47  }
    48  
    49  // For contractions, collation elements are of the form
    50  // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
    51  //   - n* is the size of the first node in the contraction trie.
    52  //   - i* is the index of the first node in the contraction trie.
    53  //   - b* is the offset into the contraction collation element table.
    54  // See contract.go for details on the contraction trie.
const (
	contractID            = 0xC0000000 // tag in the top bits marking a contraction element
	maxNBits              = 4          // width of the n* field
	maxTrieIndexBits      = 12         // width of the i* field
	maxContractOffsetBits = 13         // width of the b* field
)
    61  
    62  func makeContractIndex(h ctHandle, offset int) (uint32, error) {
    63  	if h.n >= 1<<maxNBits {
    64  		return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
    65  	}
    66  	if h.index >= 1<<maxTrieIndexBits {
    67  		return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
    68  	}
    69  	if offset >= 1<<maxContractOffsetBits {
    70  		return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
    71  	}
    72  	ce := uint32(contractID)
    73  	ce += uint32(offset << (maxNBits + maxTrieIndexBits))
    74  	ce += uint32(h.index << maxNBits)
    75  	ce += uint32(h.n)
    76  	return ce, nil
    77  }
    78  
    79  // For expansions, collation elements are of the form
    80  // 11100000 00000000 bbbbbbbb bbbbbbbb,
    81  // where b* is the index into the expansion sequence table.
    82  const (
    83  	expandID           = 0xE0000000
    84  	maxExpandIndexBits = 16
    85  )
    86  
    87  func makeExpandIndex(index int) (uint32, error) {
    88  	if index >= 1<<maxExpandIndexBits {
    89  		return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
    90  	}
    91  	return expandID + uint32(index), nil
    92  }
    93  
    94  // Each list of collation elements corresponding to an expansion starts with
    95  // a header indicating the length of the sequence.
    96  func makeExpansionHeader(n int) (uint32, error) {
    97  	return uint32(n), nil
    98  }
    99  
   100  // Some runes can be expanded using NFKD decomposition. Instead of storing the full
   101  // sequence of collation elements, we decompose the rune and lookup the collation
   102  // elements for each rune in the decomposition and modify the tertiary weights.
   103  // The collation element, in this case, is of the form
   104  // 11110000 00000000 wwwwwwww vvvvvvvv, where
   105  //   - v* is the replacement tertiary weight for the first rune,
   106  //   - w* is the replacement tertiary weight for the second rune,
   107  // Tertiary weights of subsequent runes should be replaced with maxTertiary.
   108  // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
   109  const (
   110  	decompID = 0xF0000000
   111  )
   112  
   113  func makeDecompose(t1, t2 int) (uint32, error) {
   114  	if t1 >= 256 || t1 < 0 {
   115  		return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
   116  	}
   117  	if t2 >= 256 || t2 < 0 {
   118  		return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
   119  	}
   120  	return uint32(t2<<8+t1) + decompID, nil
   121  }
   122  
// Rune ranges consulted by implicitPrimary when classifying ideographs.
// Only minUnified is explicitly typed as rune; the remaining values are
// untyped constants.
const (
	// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
	minUnified       rune = 0x4E00
	maxUnified            = 0x9FFF
	minCompatibility      = 0xF900
	maxCompatibility      = 0xFAFF
	minRare               = 0x3400
	maxRare               = 0x4DBF
)

// Offsets added to a rune value (implicitPrimary) or to a primary
// reassembled from a double primary weight (convertLargeWeights) to obtain
// the primary weight used by this package. illegalOffset is the base to
// which convertLargeWeights maps illegal primaries; maxPrimary is one more
// than illegalOffset.
const (
	commonUnifiedOffset = 0x10000
	rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
	otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
	illegalOffset       = otherOffset + int(unicode.MaxRune)
	maxPrimary          = illegalOffset + 1
)
   139  
   140  // implicitPrimary returns the primary weight for the a rune
   141  // for which there is no entry for the rune in the collation table.
   142  // We take a different approach from the one specified in
   143  // https://unicode.org/reports/tr10/#Implicit_Weights,
   144  // but preserve the resulting relative ordering of the runes.
   145  func implicitPrimary(r rune) int {
   146  	if unicode.Is(unicode.Ideographic, r) {
   147  		if r >= minUnified && r <= maxUnified {
   148  			// The most common case for CJK.
   149  			return int(r) + commonUnifiedOffset
   150  		}
   151  		if r >= minCompatibility && r <= maxCompatibility {
   152  			// This will typically not hit. The DUCET explicitly specifies mappings
   153  			// for all characters that do not decompose.
   154  			return int(r) + commonUnifiedOffset
   155  		}
   156  		return int(r) + rareUnifiedOffset
   157  	}
   158  	return int(r) + otherOffset
   159  }
   160  
   161  // convertLargeWeights converts collation elements with large
   162  // primaries (either double primaries or for illegal runes)
   163  // to our own representation.
   164  // A CJK character C is represented in the DUCET as
   165  //   [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
   166  // We will rewrite these characters to a single CE.
   167  // We assume the CJK values start at 0x8000.
   168  // See https://unicode.org/reports/tr10/#Implicit_Weights
   169  func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
   170  	const (
   171  		cjkPrimaryStart   = 0xFB40
   172  		rarePrimaryStart  = 0xFB80
   173  		otherPrimaryStart = 0xFBC0
   174  		illegalPrimary    = 0xFFFE
   175  		highBitsMask      = 0x3F
   176  		lowBitsMask       = 0x7FFF
   177  		lowBitsFlag       = 0x8000
   178  		shiftBits         = 15
   179  	)
   180  	for i := 0; i < len(elems); i++ {
   181  		ce := elems[i].w
   182  		p := ce[0]
   183  		if p < cjkPrimaryStart {
   184  			continue
   185  		}
   186  		if p > 0xFFFF {
   187  			return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
   188  		}
   189  		if p >= illegalPrimary {
   190  			ce[0] = illegalOffset + p - illegalPrimary
   191  		} else {
   192  			if i+1 >= len(elems) {
   193  				return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
   194  			}
   195  			if elems[i+1].w[0]&lowBitsFlag == 0 {
   196  				return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
   197  			}
   198  			np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
   199  			switch {
   200  			case p < rarePrimaryStart:
   201  				np += commonUnifiedOffset
   202  			case p < otherPrimaryStart:
   203  				np += rareUnifiedOffset
   204  			default:
   205  				p += otherOffset
   206  			}
   207  			ce[0] = np
   208  			for j := i + 1; j+1 < len(elems); j++ {
   209  				elems[j] = elems[j+1]
   210  			}
   211  			elems = elems[:len(elems)-1]
   212  		}
   213  	}
   214  	return elems, nil
   215  }
   216  
   217  // nextWeight computes the first possible collation weights following elems
   218  // for the given level.
   219  func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
   220  	if level == colltab.Identity {
   221  		next := make([]rawCE, len(elems))
   222  		copy(next, elems)
   223  		return next
   224  	}
   225  	next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
   226  	next[0].w[level]++
   227  	if level < colltab.Secondary {
   228  		next[0].w[colltab.Secondary] = defaultSecondary
   229  	}
   230  	if level < colltab.Tertiary {
   231  		next[0].w[colltab.Tertiary] = defaultTertiary
   232  	}
   233  	// Filter entries that cannot influence ordering.
   234  	for _, ce := range elems[1:] {
   235  		skip := true
   236  		for i := colltab.Primary; i < level; i++ {
   237  			skip = skip && ce.w[i] == 0
   238  		}
   239  		if !skip {
   240  			next = append(next, ce)
   241  		}
   242  	}
   243  	return next
   244  }
   245  
   246  func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
   247  	for ; i < len(elems) && elems[i].w[level] == 0; i++ {
   248  	}
   249  	if i < len(elems) {
   250  		return i, elems[i].w[level]
   251  	}
   252  	return i, 0
   253  }
   254  
   255  // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
   256  // It also returns the collation level at which the difference is found.
   257  func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
   258  	for level := colltab.Primary; level < colltab.Identity; level++ {
   259  		var va, vb int
   260  		for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
   261  			ia, va = nextVal(a, ia, level)
   262  			ib, vb = nextVal(b, ib, level)
   263  			if va != vb {
   264  				if va < vb {
   265  					return -1, level
   266  				} else {
   267  					return 1, level
   268  				}
   269  			}
   270  		}
   271  	}
   272  	return 0, colltab.Identity
   273  }
   274  
   275  func equalCE(a, b rawCE) bool {
   276  	for i := 0; i < 3; i++ {
   277  		if b.w[i] != a.w[i] {
   278  			return false
   279  		}
   280  	}
   281  	return true
   282  }
   283  
   284  func equalCEArrays(a, b []rawCE) bool {
   285  	if len(a) != len(b) {
   286  		return false
   287  	}
   288  	for i := range a {
   289  		if !equalCE(a[i], b[i]) {
   290  			return false
   291  		}
   292  	}
   293  	return true
   294  }