github.com/liquid-dev/text@v0.3.3-liquid/internal/colltab/collelem.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package colltab
     6  
     7  import (
     8  	"fmt"
     9  	"unicode"
    10  )
    11  
    12  // Level identifies the collation comparison level.
    13  // The primary level corresponds to the basic sorting of text.
    14  // The secondary level corresponds to accents and related linguistic elements.
    15  // The tertiary level corresponds to casing and related concepts.
    16  // The quaternary level is derived from the other levels by the
    17  // various algorithms for handling variable elements.
    18  type Level int
    19  
    20  const (
    21  	Primary Level = iota
    22  	Secondary
    23  	Tertiary
    24  	Quaternary
    25  	Identity
    26  
    27  	NumLevels
    28  )
    29  
    30  const (
    31  	defaultSecondary = 0x20
    32  	defaultTertiary  = 0x2
    33  	maxTertiary      = 0x1F
    34  	MaxQuaternary    = 0x1FFFFF // 21 bits.
    35  )
    36  
    37  // Elem is a representation of a collation element. This API provides ways to encode
    38  // and decode Elems. Implementations of collation tables may use values greater
    39  // or equal to PrivateUse for their own purposes.  However, these should never be
    40  // returned by AppendNext.
    41  type Elem uint32
    42  
    43  const (
    44  	maxCE       Elem = 0xAFFFFFFF
    45  	PrivateUse       = minContract
    46  	minContract      = 0xC0000000
    47  	maxContract      = 0xDFFFFFFF
    48  	minExpand        = 0xE0000000
    49  	maxExpand        = 0xEFFFFFFF
    50  	minDecomp        = 0xF0000000
    51  )
    52  
    53  type ceType int
    54  
    55  const (
    56  	ceNormal           ceType = iota // ceNormal includes implicits (ce == 0)
    57  	ceContractionIndex               // rune can be a start of a contraction
    58  	ceExpansionIndex                 // rune expands into a sequence of collation elements
    59  	ceDecompose                      // rune expands using NFKC decomposition
    60  )
    61  
    62  func (ce Elem) ctype() ceType {
    63  	if ce <= maxCE {
    64  		return ceNormal
    65  	}
    66  	if ce <= maxContract {
    67  		return ceContractionIndex
    68  	} else {
    69  		if ce <= maxExpand {
    70  			return ceExpansionIndex
    71  		}
    72  		return ceDecompose
    73  	}
    74  	panic("should not reach here")
    75  	return ceType(-1)
    76  }
    77  
    78  // For normal collation elements, we assume that a collation element either has
    79  // a primary or non-default secondary value, not both.
    80  // Collation elements with a primary value are of the form
    81  // 01pppppp pppppppp ppppppp0 ssssssss
    82  //   - p* is primary collation value
    83  //   - s* is the secondary collation value
    84  // 00pppppp pppppppp ppppppps sssttttt, where
    85  //   - p* is primary collation value
    86  //   - s* offset of secondary from default value.
    87  //   - t* is the tertiary collation value
    88  // 100ttttt cccccccc pppppppp pppppppp
    89  //   - t* is the tertiar collation value
    90  //   - c* is the canonical combining class
    91  //   - p* is the primary collation value
    92  // Collation elements with a secondary value are of the form
    93  // 1010cccc ccccssss ssssssss tttttttt, where
    94  //   - c* is the canonical combining class
    95  //   - s* is the secondary collation value
    96  //   - t* is the tertiary collation value
    97  // 11qqqqqq qqqqqqqq qqqqqqq0 00000000
    98  //   - q* quaternary value
    99  const (
   100  	ceTypeMask              = 0xC0000000
   101  	ceTypeMaskExt           = 0xE0000000
   102  	ceIgnoreMask            = 0xF00FFFFF
   103  	ceType1                 = 0x40000000
   104  	ceType2                 = 0x00000000
   105  	ceType3or4              = 0x80000000
   106  	ceType4                 = 0xA0000000
   107  	ceTypeQ                 = 0xC0000000
   108  	Ignore                  = ceType4
   109  	firstNonPrimary         = 0x80000000
   110  	lastSpecialPrimary      = 0xA0000000
   111  	secondaryMask           = 0x80000000
   112  	hasTertiaryMask         = 0x40000000
   113  	primaryValueMask        = 0x3FFFFE00
   114  	maxPrimaryBits          = 21
   115  	compactPrimaryBits      = 16
   116  	maxSecondaryBits        = 12
   117  	maxTertiaryBits         = 8
   118  	maxCCCBits              = 8
   119  	maxSecondaryCompactBits = 8
   120  	maxSecondaryDiffBits    = 4
   121  	maxTertiaryCompactBits  = 5
   122  	primaryShift            = 9
   123  	compactSecondaryShift   = 5
   124  	minCompactSecondary     = defaultSecondary - 4
   125  )
   126  
   127  func makeImplicitCE(primary int) Elem {
   128  	return ceType1 | Elem(primary<<primaryShift) | defaultSecondary
   129  }
   130  
   131  // MakeElem returns an Elem for the given values.  It will return an error
   132  // if the given combination of values is invalid.
   133  func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) {
   134  	if w := primary; w >= 1<<maxPrimaryBits || w < 0 {
   135  		return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
   136  	}
   137  	if w := secondary; w >= 1<<maxSecondaryBits || w < 0 {
   138  		return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
   139  	}
   140  	if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 {
   141  		return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits)
   142  	}
   143  	ce := Elem(0)
   144  	if primary != 0 {
   145  		if ccc != 0 {
   146  			if primary >= 1<<compactPrimaryBits {
   147  				return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits)
   148  			}
   149  			if secondary != defaultSecondary {
   150  				return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc)
   151  			}
   152  			ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits))
   153  			ce |= Elem(ccc) << compactPrimaryBits
   154  			ce |= Elem(primary)
   155  			ce |= ceType3or4
   156  		} else if tertiary == defaultTertiary {
   157  			if secondary >= 1<<maxSecondaryCompactBits {
   158  				return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits)
   159  			}
   160  			ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary)
   161  			ce |= ceType1
   162  		} else {
   163  			d := secondary - defaultSecondary + maxSecondaryDiffBits
   164  			if d >= 1<<maxSecondaryDiffBits || d < 0 {
   165  				return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
   166  			}
   167  			if tertiary >= 1<<maxTertiaryCompactBits {
   168  				return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits)
   169  			}
   170  			ce = Elem(primary<<maxSecondaryDiffBits + d)
   171  			ce = ce<<maxTertiaryCompactBits + Elem(tertiary)
   172  		}
   173  	} else {
   174  		ce = Elem(secondary<<maxTertiaryBits + tertiary)
   175  		ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits)
   176  		ce |= ceType4
   177  	}
   178  	return ce, nil
   179  }
   180  
   181  // MakeQuaternary returns an Elem with the given quaternary value.
   182  func MakeQuaternary(v int) Elem {
   183  	return ceTypeQ | Elem(v<<primaryShift)
   184  }
   185  
   186  // Mask sets weights for any level smaller than l to 0.
   187  // The resulting Elem can be used to test for equality with
   188  // other Elems to which the same mask has been applied.
   189  func (ce Elem) Mask(l Level) uint32 {
   190  	return 0
   191  }
   192  
   193  // CCC returns the canonical combining class associated with the underlying character,
   194  // if applicable, or 0 otherwise.
   195  func (ce Elem) CCC() uint8 {
   196  	if ce&ceType3or4 != 0 {
   197  		if ce&ceType4 == ceType3or4 {
   198  			return uint8(ce >> 16)
   199  		}
   200  		return uint8(ce >> 20)
   201  	}
   202  	return 0
   203  }
   204  
   205  // Primary returns the primary collation weight for ce.
   206  func (ce Elem) Primary() int {
   207  	if ce >= firstNonPrimary {
   208  		if ce > lastSpecialPrimary {
   209  			return 0
   210  		}
   211  		return int(uint16(ce))
   212  	}
   213  	return int(ce&primaryValueMask) >> primaryShift
   214  }
   215  
   216  // Secondary returns the secondary collation weight for ce.
   217  func (ce Elem) Secondary() int {
   218  	switch ce & ceTypeMask {
   219  	case ceType1:
   220  		return int(uint8(ce))
   221  	case ceType2:
   222  		return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
   223  	case ceType3or4:
   224  		if ce < ceType4 {
   225  			return defaultSecondary
   226  		}
   227  		return int(ce>>8) & 0xFFF
   228  	case ceTypeQ:
   229  		return 0
   230  	}
   231  	panic("should not reach here")
   232  }
   233  
   234  // Tertiary returns the tertiary collation weight for ce.
   235  func (ce Elem) Tertiary() uint8 {
   236  	if ce&hasTertiaryMask == 0 {
   237  		if ce&ceType3or4 == 0 {
   238  			return uint8(ce & 0x1F)
   239  		}
   240  		if ce&ceType4 == ceType4 {
   241  			return uint8(ce)
   242  		}
   243  		return uint8(ce>>24) & 0x1F // type 2
   244  	} else if ce&ceTypeMask == ceType1 {
   245  		return defaultTertiary
   246  	}
   247  	// ce is a quaternary value.
   248  	return 0
   249  }
   250  
   251  func (ce Elem) updateTertiary(t uint8) Elem {
   252  	if ce&ceTypeMask == ceType1 {
   253  		// convert to type 4
   254  		nce := ce & primaryValueMask
   255  		nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
   256  		ce = nce
   257  	} else if ce&ceTypeMaskExt == ceType3or4 {
   258  		ce &= ^Elem(maxTertiary << 24)
   259  		return ce | (Elem(t) << 24)
   260  	} else {
   261  		// type 2 or 4
   262  		ce &= ^Elem(maxTertiary)
   263  	}
   264  	return ce | Elem(t)
   265  }
   266  
   267  // Quaternary returns the quaternary value if explicitly specified,
   268  // 0 if ce == Ignore, or MaxQuaternary otherwise.
   269  // Quaternary values are used only for shifted variants.
   270  func (ce Elem) Quaternary() int {
   271  	if ce&ceTypeMask == ceTypeQ {
   272  		return int(ce&primaryValueMask) >> primaryShift
   273  	} else if ce&ceIgnoreMask == Ignore {
   274  		return 0
   275  	}
   276  	return MaxQuaternary
   277  }
   278  
   279  // Weight returns the collation weight for the given level.
   280  func (ce Elem) Weight(l Level) int {
   281  	switch l {
   282  	case Primary:
   283  		return ce.Primary()
   284  	case Secondary:
   285  		return ce.Secondary()
   286  	case Tertiary:
   287  		return int(ce.Tertiary())
   288  	case Quaternary:
   289  		return ce.Quaternary()
   290  	}
   291  	return 0 // return 0 (ignore) for undefined levels.
   292  }
   293  
   294  // For contractions, collation elements are of the form
   295  // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
   296  //   - n* is the size of the first node in the contraction trie.
   297  //   - i* is the index of the first node in the contraction trie.
   298  //   - b* is the offset into the contraction collation element table.
   299  // See contract.go for details on the contraction trie.
   300  const (
   301  	maxNBits              = 4
   302  	maxTrieIndexBits      = 12
   303  	maxContractOffsetBits = 13
   304  )
   305  
   306  func splitContractIndex(ce Elem) (index, n, offset int) {
   307  	n = int(ce & (1<<maxNBits - 1))
   308  	ce >>= maxNBits
   309  	index = int(ce & (1<<maxTrieIndexBits - 1))
   310  	ce >>= maxTrieIndexBits
   311  	offset = int(ce & (1<<maxContractOffsetBits - 1))
   312  	return
   313  }
   314  
   315  // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
   316  // where b* is the index into the expansion sequence table.
   317  const maxExpandIndexBits = 16
   318  
   319  func splitExpandIndex(ce Elem) (index int) {
   320  	return int(uint16(ce))
   321  }
   322  
   323  // Some runes can be expanded using NFKD decomposition. Instead of storing the full
   324  // sequence of collation elements, we decompose the rune and lookup the collation
   325  // elements for each rune in the decomposition and modify the tertiary weights.
   326  // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
   327  //   - v* is the replacement tertiary weight for the first rune,
   328  //   - w* is the replacement tertiary weight for the second rune,
   329  // Tertiary weights of subsequent runes should be replaced with maxTertiary.
   330  // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
   331  func splitDecompose(ce Elem) (t1, t2 uint8) {
   332  	return uint8(ce), uint8(ce >> 8)
   333  }
   334  
   335  const (
   336  	// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
   337  	minUnified       rune = 0x4E00
   338  	maxUnified            = 0x9FFF
   339  	minCompatibility      = 0xF900
   340  	maxCompatibility      = 0xFAFF
   341  	minRare               = 0x3400
   342  	maxRare               = 0x4DBF
   343  )
   344  const (
   345  	commonUnifiedOffset = 0x10000
   346  	rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
   347  	otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
   348  	illegalOffset       = otherOffset + int(unicode.MaxRune)
   349  	maxPrimary          = illegalOffset + 1
   350  )
   351  
   352  // implicitPrimary returns the primary weight for the a rune
   353  // for which there is no entry for the rune in the collation table.
   354  // We take a different approach from the one specified in
   355  // https://unicode.org/reports/tr10/#Implicit_Weights,
   356  // but preserve the resulting relative ordering of the runes.
   357  func implicitPrimary(r rune) int {
   358  	if unicode.Is(unicode.Ideographic, r) {
   359  		if r >= minUnified && r <= maxUnified {
   360  			// The most common case for CJK.
   361  			return int(r) + commonUnifiedOffset
   362  		}
   363  		if r >= minCompatibility && r <= maxCompatibility {
   364  			// This will typically not hit. The DUCET explicitly specifies mappings
   365  			// for all characters that do not decompose.
   366  			return int(r) + commonUnifiedOffset
   367  		}
   368  		return int(r) + rareUnifiedOffset
   369  	}
   370  	return int(r) + otherOffset
   371  }