github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/colltab/numeric.go (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package colltab
     6  
     7  import (
     8  	"unicode"
     9  	"unicode/utf8"
    10  )
    11  
    12  // NewNumericWeighter wraps w to replace individual digits to sort based on their
    13  // numeric value.
    14  //
    15  // Weighter w must have a free primary weight after the primary weight for 9.
    16  // If this is not the case, numeric value will sort at the same primary level
    17  // as the first primary sorting after 9.
    18  func NewNumericWeighter(w Weighter) Weighter {
    19  	getElem := func(s string) Elem {
    20  		elems, _ := w.AppendNextString(nil, s)
    21  		return elems[0]
    22  	}
    23  	nine := getElem("9")
    24  
    25  	// Numbers should order before zero, but the DUCET has no room for this.
    26  	// TODO: move before zero once we use fractional collation elements.
    27  	ns, _ := MakeElem(nine.Primary()+1, nine.Secondary(), int(nine.Tertiary()), 0)
    28  
    29  	return &numericWeighter{
    30  		Weighter: w,
    31  
    32  		// We assume that w sorts digits of different kinds in order of numeric
    33  		// value and that the tertiary weight order is preserved.
    34  		//
    35  		// TODO: evaluate whether it is worth basing the ranges on the Elem
    36  		// encoding itself once the move to fractional weights is complete.
    37  		zero:          getElem("0"),
    38  		zeroSpecialLo: getElem("0"), // U+FF10 FULLWIDTH DIGIT ZERO
    39  		zeroSpecialHi: getElem("₀"), // U+2080 SUBSCRIPT ZERO
    40  		nine:          nine,
    41  		nineSpecialHi: getElem("₉"), // U+2089 SUBSCRIPT NINE
    42  		numberStart:   ns,
    43  	}
    44  }
    45  
    46  // A numericWeighter translates a stream of digits into a stream of weights
    47  // representing the numeric value.
    48  type numericWeighter struct {
    49  	Weighter
    50  
    51  	// The Elems below all demarcate boundaries of specific ranges. With the
    52  	// current element encoding digits are in two ranges: normal (default
    53  	// tertiary value) and special. For most languages, digits have collation
    54  	// elements in the normal range.
    55  	//
    56  	// Note: the range tests are very specific for the element encoding used by
    57  	// this implementation. The tests in collate_test.go are designed to fail
    58  	// if this code is not updated when an encoding has changed.
    59  
    60  	zero          Elem // normal digit zero
    61  	zeroSpecialLo Elem // special digit zero, low tertiary value
    62  	zeroSpecialHi Elem // special digit zero, high tertiary value
    63  	nine          Elem // normal digit nine
    64  	nineSpecialHi Elem // special digit nine
    65  	numberStart   Elem
    66  }
    67  
    68  // AppendNext calls the namesake of the underlying weigher, but replaces single
    69  // digits with weights representing their value.
    70  func (nw *numericWeighter) AppendNext(buf []Elem, s []byte) (ce []Elem, n int) {
    71  	ce, n = nw.Weighter.AppendNext(buf, s)
    72  	nc := numberConverter{
    73  		elems: buf,
    74  		w:     nw,
    75  		b:     s,
    76  	}
    77  	isZero, ok := nc.checkNextDigit(ce)
    78  	if !ok {
    79  		return ce, n
    80  	}
    81  	// ce might have been grown already, so take it instead of buf.
    82  	nc.init(ce, len(buf), isZero)
    83  	for n < len(s) {
    84  		ce, sz := nw.Weighter.AppendNext(nc.elems, s[n:])
    85  		nc.b = s
    86  		n += sz
    87  		if !nc.update(ce) {
    88  			break
    89  		}
    90  	}
    91  	return nc.result(), n
    92  }
    93  
    94  // AppendNextString calls the namesake of the underlying weigher, but replaces
    95  // single digits with weights representing their value.
    96  func (nw *numericWeighter) AppendNextString(buf []Elem, s string) (ce []Elem, n int) {
    97  	ce, n = nw.Weighter.AppendNextString(buf, s)
    98  	nc := numberConverter{
    99  		elems: buf,
   100  		w:     nw,
   101  		s:     s,
   102  	}
   103  	isZero, ok := nc.checkNextDigit(ce)
   104  	if !ok {
   105  		return ce, n
   106  	}
   107  	nc.init(ce, len(buf), isZero)
   108  	for n < len(s) {
   109  		ce, sz := nw.Weighter.AppendNextString(nc.elems, s[n:])
   110  		nc.s = s
   111  		n += sz
   112  		if !nc.update(ce) {
   113  			break
   114  		}
   115  	}
   116  	return nc.result(), n
   117  }
   118  
   119  type numberConverter struct {
   120  	w *numericWeighter
   121  
   122  	elems    []Elem
   123  	nDigits  int
   124  	lenIndex int
   125  
   126  	s string // set if the input was of type string
   127  	b []byte // set if the input was of type []byte
   128  }
   129  
   130  // init completes initialization of a numberConverter and prepares it for adding
   131  // more digits. elems is assumed to have a digit starting at oldLen.
   132  func (nc *numberConverter) init(elems []Elem, oldLen int, isZero bool) {
   133  	// Insert a marker indicating the start of a number and a placeholder
   134  	// for the number of digits.
   135  	if isZero {
   136  		elems = append(elems[:oldLen], nc.w.numberStart, 0)
   137  	} else {
   138  		elems = append(elems, 0, 0)
   139  		copy(elems[oldLen+2:], elems[oldLen:])
   140  		elems[oldLen] = nc.w.numberStart
   141  		elems[oldLen+1] = 0
   142  
   143  		nc.nDigits = 1
   144  	}
   145  	nc.elems = elems
   146  	nc.lenIndex = oldLen + 1
   147  }
   148  
   149  // checkNextDigit reports whether bufNew adds a single digit relative to the old
   150  // buffer. If it does, it also reports whether this digit is zero.
   151  func (nc *numberConverter) checkNextDigit(bufNew []Elem) (isZero, ok bool) {
   152  	if len(nc.elems) >= len(bufNew) {
   153  		return false, false
   154  	}
   155  	e := bufNew[len(nc.elems)]
   156  	if e < nc.w.zeroSpecialLo || nc.w.nine < e {
   157  		// Not a number.
   158  		return false, false
   159  	}
   160  	if e < nc.w.zero {
   161  		if e > nc.w.nineSpecialHi {
   162  			// Not a number.
   163  			return false, false
   164  		}
   165  		if !nc.isDigit() {
   166  			return false, false
   167  		}
   168  		isZero = e <= nc.w.zeroSpecialHi
   169  	} else {
   170  		// This is the common case if we encounter a digit.
   171  		isZero = e == nc.w.zero
   172  	}
   173  	// Test the remaining added collation elements have a zero primary value.
   174  	if n := len(bufNew) - len(nc.elems); n > 1 {
   175  		for i := len(nc.elems) + 1; i < len(bufNew); i++ {
   176  			if bufNew[i].Primary() != 0 {
   177  				return false, false
   178  			}
   179  		}
   180  		// In some rare cases, collation elements will encode runes in
   181  		// unicode.No as a digit. For example Ethiopic digits (U+1369 - U+1371)
   182  		// are not in Nd. Also some digits that clearly belong in unicode.No,
   183  		// like U+0C78 TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR, have
   184  		// collation elements indistinguishable from normal digits.
   185  		// Unfortunately, this means we need to make this check for nearly all
   186  		// non-Latin digits.
   187  		//
   188  		// TODO: check the performance impact and find something better if it is
   189  		// an issue.
   190  		if !nc.isDigit() {
   191  			return false, false
   192  		}
   193  	}
   194  	return isZero, true
   195  }
   196  
   197  func (nc *numberConverter) isDigit() bool {
   198  	if nc.b != nil {
   199  		r, _ := utf8.DecodeRune(nc.b)
   200  		return unicode.In(r, unicode.Nd)
   201  	}
   202  	r, _ := utf8.DecodeRuneInString(nc.s)
   203  	return unicode.In(r, unicode.Nd)
   204  }
   205  
   206  // We currently support a maximum of about 2M digits (the number of primary
   207  // values). Such numbers will compare correctly against small numbers, but their
   208  // comparison against other large numbers is undefined.
   209  //
   210  // TODO: define a proper fallback, such as comparing large numbers textually or
   211  // actually allowing numbers of unlimited length.
   212  //
   213  // TODO: cap this to a lower number (like 100) and maybe allow a larger number
   214  // in an option?
   215  const maxDigits = 1<<maxPrimaryBits - 1
   216  
   217  func (nc *numberConverter) update(elems []Elem) bool {
   218  	isZero, ok := nc.checkNextDigit(elems)
   219  	if nc.nDigits == 0 && isZero {
   220  		return true
   221  	}
   222  	nc.elems = elems
   223  	if !ok {
   224  		return false
   225  	}
   226  	nc.nDigits++
   227  	return nc.nDigits < maxDigits
   228  }
   229  
   230  // result fills in the length element for the digit sequence and returns the
   231  // completed collation elements.
   232  func (nc *numberConverter) result() []Elem {
   233  	e, _ := MakeElem(nc.nDigits, defaultSecondary, defaultTertiary, 0)
   234  	nc.elems[nc.lenIndex] = e
   235  	return nc.elems
   236  }