github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/collate/collate.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // TODO: remove hard-coded versions when we have implemented fractional weights.
     6  // The current implementation is incompatible with later CLDR versions.
     7  //go:generate go run maketables.go -cldr=23 -unicode=6.2.0
     8  
     9  // Package collate contains types for comparing and sorting Unicode strings
    10  // according to a given collation order.  Package locale provides a high-level
    11  // interface to collation. Users should typically use that package instead.
    12  package collate // import "golang.org/x/text/collate"
    13  
    14  import (
    15  	"bytes"
    16  	"strings"
    17  
    18  	"golang.org/x/text/collate/colltab"
    19  	newcolltab "golang.org/x/text/internal/colltab"
    20  	"golang.org/x/text/language"
    21  )
    22  
// Collator provides functionality for comparing strings for a given
// collation order.
//
// A Collator reuses its internal iterators on every call to Compare,
// CompareString, Key and KeyFromString.
// NOTE(review): because of that shared state a single Collator is presumably
// not safe for concurrent use — confirm before sharing one across goroutines.
type Collator struct {
	// options is embedded, so its fields are accessed directly on the
	// Collator.
	options

	sorter sorter

	// _iter holds the two reusable iterators handed out by iter(0) and
	// iter(1). Compare and CompareString load one input into each; key
	// generation only uses the first.
	_iter [2]iter
}
    32  
// iter returns a pointer to the i-th of the Collator's two reusable
// iterators. i must be 0 or 1; any other value is out of range for the
// underlying two-element array.
func (c *Collator) iter(i int) *iter {
	// TODO: evaluate performance for making the second iterator optional.
	return &c._iter[i]
}
    37  
    38  // Supported returns the list of languages for which collating differs from its parent.
    39  func Supported() []language.Tag {
    40  	// TODO: use language.Coverage instead.
    41  
    42  	t := make([]language.Tag, len(tags))
    43  	copy(t, tags)
    44  	return t
    45  }
    46  
    47  func init() {
    48  	ids := strings.Split(availableLocales, ",")
    49  	tags = make([]language.Tag, len(ids))
    50  	for i, s := range ids {
    51  		tags[i] = language.Raw.MustParse(s)
    52  	}
    53  }
    54  
// tags lists the language tags parsed from availableLocales by init. New
// matches a requested tag against it, and Supported returns a copy of it.
var tags []language.Tag
    56  
    57  // New returns a new Collator initialized for the given locale.
    58  func New(t language.Tag, o ...Option) *Collator {
    59  	index := newcolltab.MatchLang(t, tags)
    60  	c := newCollator(colltab.Init(locales[index]))
    61  
    62  	// Set options from the user-supplied tag.
    63  	c.setFromTag(t)
    64  
    65  	// Set the user-supplied options.
    66  	c.setOptions(o)
    67  
    68  	c.init()
    69  	return c
    70  }
    71  
// NewFromTable returns a new Collator for the given Weighter.
// The options o are applied via setOptions before the Collator's iterators
// are initialized. Unlike New, no settings are derived from a language tag.
func NewFromTable(w colltab.Weighter, o ...Option) *Collator {
	c := newCollator(w)
	c.setOptions(o)
	c.init()
	return c
}
    79  
    80  func (c *Collator) init() {
    81  	if c.numeric {
    82  		c.t = colltab.NewNumericWeighter(c.t)
    83  	}
    84  	c._iter[0].init(c)
    85  	c._iter[1].init(c)
    86  }
    87  
    88  // Buffer holds keys generated by Key and KeyString.
    89  type Buffer struct {
    90  	buf [4096]byte
    91  	key []byte
    92  }
    93  
    94  func (b *Buffer) init() {
    95  	if b.key == nil {
    96  		b.key = b.buf[:0]
    97  	}
    98  }
    99  
   100  // Reset clears the buffer from previous results generated by Key and KeyString.
   101  func (b *Buffer) Reset() {
   102  	b.key = b.key[:0]
   103  }
   104  
   105  // Compare returns an integer comparing the two byte slices.
   106  // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
   107  func (c *Collator) Compare(a, b []byte) int {
   108  	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
   109  	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
   110  	c.iter(0).SetInput(a)
   111  	c.iter(1).SetInput(b)
   112  	if res := c.compare(); res != 0 {
   113  		return res
   114  	}
   115  	if !c.ignore[colltab.Identity] {
   116  		return bytes.Compare(a, b)
   117  	}
   118  	return 0
   119  }
   120  
   121  // CompareString returns an integer comparing the two strings.
   122  // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
   123  func (c *Collator) CompareString(a, b string) int {
   124  	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
   125  	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
   126  	c.iter(0).SetInputString(a)
   127  	c.iter(1).SetInputString(b)
   128  	if res := c.compare(); res != 0 {
   129  		return res
   130  	}
   131  	if !c.ignore[colltab.Identity] {
   132  		if a < b {
   133  			return -1
   134  		} else if a > b {
   135  			return 1
   136  		}
   137  	}
   138  	return 0
   139  }
   140  
   141  func compareLevel(f func(i *iter) int, a, b *iter) int {
   142  	a.pce = 0
   143  	b.pce = 0
   144  	for {
   145  		va := f(a)
   146  		vb := f(b)
   147  		if va != vb {
   148  			if va < vb {
   149  				return -1
   150  			}
   151  			return 1
   152  		} else if va == 0 {
   153  			break
   154  		}
   155  	}
   156  	return 0
   157  }
   158  
   159  func (c *Collator) compare() int {
   160  	ia, ib := c.iter(0), c.iter(1)
   161  	// Process primary level
   162  	if c.alternate != altShifted {
   163  		// TODO: implement script reordering
   164  		if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
   165  			return res
   166  		}
   167  	} else {
   168  		// TODO: handle shifted
   169  	}
   170  	if !c.ignore[colltab.Secondary] {
   171  		f := (*iter).nextSecondary
   172  		if c.backwards {
   173  			f = (*iter).prevSecondary
   174  		}
   175  		if res := compareLevel(f, ia, ib); res != 0 {
   176  			return res
   177  		}
   178  	}
   179  	// TODO: special case handling (Danish?)
   180  	if !c.ignore[colltab.Tertiary] || c.caseLevel {
   181  		if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
   182  			return res
   183  		}
   184  		if !c.ignore[colltab.Quaternary] {
   185  			if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
   186  				return res
   187  			}
   188  		}
   189  	}
   190  	return 0
   191  }
   192  
// Key returns the collation key for str.
// Passing the buffer buf may avoid memory allocations.
// The key is appended to whatever buf already holds. The returned slice will
// point to an allocation in Buffer and will remain valid until the next call
// to buf.Reset().
func (c *Collator) Key(buf *Buffer, str []byte) []byte {
	// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
	buf.init()
	return c.key(buf, c.getColElems(str))
}
   202  
// KeyFromString returns the collation key for str.
// Passing the buffer buf may avoid memory allocations.
// The key is appended to whatever buf already holds. The returned slice will
// point to an allocation in Buffer and will remain valid until the next call
// to buf.Reset().
func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
	// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
	buf.init()
	return c.key(buf, c.getColElemsString(str))
}
   212  
   213  func (c *Collator) key(buf *Buffer, w []colltab.Elem) []byte {
   214  	processWeights(c.alternate, c.t.Top(), w)
   215  	kn := len(buf.key)
   216  	c.keyFromElems(buf, w)
   217  	return buf.key[kn:]
   218  }
   219  
   220  func (c *Collator) getColElems(str []byte) []colltab.Elem {
   221  	i := c.iter(0)
   222  	i.SetInput(str)
   223  	for i.Next() {
   224  	}
   225  	return i.Elems
   226  }
   227  
   228  func (c *Collator) getColElemsString(str string) []colltab.Elem {
   229  	i := c.iter(0)
   230  	i.SetInputString(str)
   231  	for i.Next() {
   232  	}
   233  	return i.Elems
   234  }
   235  
// iter wraps a newcolltab.Iter with the extra state the Collator needs to
// walk the collation elements of one input level by level.
type iter struct {
	// wa is the fixed-size array that init installs as the initial backing
	// storage for Elems.
	wa [512]colltab.Elem

	newcolltab.Iter
	// pce is the cursor used by the next*/prev* methods: the number of
	// entries of Elems already examined at the level currently being
	// compared. compareLevel resets it to 0 before each level.
	pce int
}
   242  
// init prepares the iterator for use with c: it adopts c's weighter and
// points Elems at an empty slice backed by the iterator's own array wa.
func (i *iter) init(c *Collator) {
	i.Weighter = c.t
	i.Elems = i.wa[:0]
}
   247  
   248  func (i *iter) nextPrimary() int {
   249  	for {
   250  		for ; i.pce < i.N; i.pce++ {
   251  			if v := i.Elems[i.pce].Primary(); v != 0 {
   252  				i.pce++
   253  				return v
   254  			}
   255  		}
   256  		if !i.Next() {
   257  			return 0
   258  		}
   259  	}
   260  	panic("should not reach here")
   261  }
   262  
   263  func (i *iter) nextSecondary() int {
   264  	for ; i.pce < len(i.Elems); i.pce++ {
   265  		if v := i.Elems[i.pce].Secondary(); v != 0 {
   266  			i.pce++
   267  			return v
   268  		}
   269  	}
   270  	return 0
   271  }
   272  
   273  func (i *iter) prevSecondary() int {
   274  	for ; i.pce < len(i.Elems); i.pce++ {
   275  		if v := i.Elems[len(i.Elems)-i.pce-1].Secondary(); v != 0 {
   276  			i.pce++
   277  			return v
   278  		}
   279  	}
   280  	return 0
   281  }
   282  
   283  func (i *iter) nextTertiary() int {
   284  	for ; i.pce < len(i.Elems); i.pce++ {
   285  		if v := i.Elems[i.pce].Tertiary(); v != 0 {
   286  			i.pce++
   287  			return int(v)
   288  		}
   289  	}
   290  	return 0
   291  }
   292  
   293  func (i *iter) nextQuaternary() int {
   294  	for ; i.pce < len(i.Elems); i.pce++ {
   295  		if v := i.Elems[i.pce].Quaternary(); v != 0 {
   296  			i.pce++
   297  			return v
   298  		}
   299  	}
   300  	return 0
   301  }
   302  
   303  func appendPrimary(key []byte, p int) []byte {
   304  	// Convert to variable length encoding; supports up to 23 bits.
   305  	if p <= 0x7FFF {
   306  		key = append(key, uint8(p>>8), uint8(p))
   307  	} else {
   308  		key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
   309  	}
   310  	return key
   311  }
   312  
   313  // keyFromElems converts the weights ws to a compact sequence of bytes.
   314  // The result will be appended to the byte buffer in buf.
   315  func (c *Collator) keyFromElems(buf *Buffer, ws []colltab.Elem) {
   316  	for _, v := range ws {
   317  		if w := v.Primary(); w > 0 {
   318  			buf.key = appendPrimary(buf.key, w)
   319  		}
   320  	}
   321  	if !c.ignore[colltab.Secondary] {
   322  		buf.key = append(buf.key, 0, 0)
   323  		// TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
   324  		if !c.backwards {
   325  			for _, v := range ws {
   326  				if w := v.Secondary(); w > 0 {
   327  					buf.key = append(buf.key, uint8(w>>8), uint8(w))
   328  				}
   329  			}
   330  		} else {
   331  			for i := len(ws) - 1; i >= 0; i-- {
   332  				if w := ws[i].Secondary(); w > 0 {
   333  					buf.key = append(buf.key, uint8(w>>8), uint8(w))
   334  				}
   335  			}
   336  		}
   337  	} else if c.caseLevel {
   338  		buf.key = append(buf.key, 0, 0)
   339  	}
   340  	if !c.ignore[colltab.Tertiary] || c.caseLevel {
   341  		buf.key = append(buf.key, 0, 0)
   342  		for _, v := range ws {
   343  			if w := v.Tertiary(); w > 0 {
   344  				buf.key = append(buf.key, uint8(w))
   345  			}
   346  		}
   347  		// Derive the quaternary weights from the options and other levels.
   348  		// Note that we represent MaxQuaternary as 0xFF. The first byte of the
   349  		// representation of a primary weight is always smaller than 0xFF,
   350  		// so using this single byte value will compare correctly.
   351  		if !c.ignore[colltab.Quaternary] && c.alternate >= altShifted {
   352  			if c.alternate == altShiftTrimmed {
   353  				lastNonFFFF := len(buf.key)
   354  				buf.key = append(buf.key, 0)
   355  				for _, v := range ws {
   356  					if w := v.Quaternary(); w == colltab.MaxQuaternary {
   357  						buf.key = append(buf.key, 0xFF)
   358  					} else if w > 0 {
   359  						buf.key = appendPrimary(buf.key, w)
   360  						lastNonFFFF = len(buf.key)
   361  					}
   362  				}
   363  				buf.key = buf.key[:lastNonFFFF]
   364  			} else {
   365  				buf.key = append(buf.key, 0)
   366  				for _, v := range ws {
   367  					if w := v.Quaternary(); w == colltab.MaxQuaternary {
   368  						buf.key = append(buf.key, 0xFF)
   369  					} else if w > 0 {
   370  						buf.key = appendPrimary(buf.key, w)
   371  					}
   372  				}
   373  			}
   374  		}
   375  	}
   376  }
   377  
   378  func processWeights(vw alternateHandling, top uint32, wa []colltab.Elem) {
   379  	ignore := false
   380  	vtop := int(top)
   381  	switch vw {
   382  	case altShifted, altShiftTrimmed:
   383  		for i := range wa {
   384  			if p := wa[i].Primary(); p <= vtop && p != 0 {
   385  				wa[i] = colltab.MakeQuaternary(p)
   386  				ignore = true
   387  			} else if p == 0 {
   388  				if ignore {
   389  					wa[i] = colltab.Ignore
   390  				}
   391  			} else {
   392  				ignore = false
   393  			}
   394  		}
   395  	case altBlanked:
   396  		for i := range wa {
   397  			if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
   398  				wa[i] = colltab.Ignore
   399  				ignore = true
   400  			} else {
   401  				ignore = false
   402  			}
   403  		}
   404  	}
   405  }