github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/unicode/norm/forminfo.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package norm
     6  
     7  // This file contains Form-specific logic and wrappers for data in tables.go.
     8  
     9  // Rune info is stored in a separate trie per composing form. A composing form
    10  // and its corresponding decomposing form share the same trie.  Each trie maps
    11  // a rune to a uint16. The values take two forms.  For v >= 0x8000:
    12  //   bits
//   15:    1 (inverse of NFD_QC bit of qcInfo)
//   13..8: qcInfo (see below). isYesD is always true (no decomposition).
//    7..0: ccc (compressed CCC value).
    16  // For v < 0x8000, the respective rune has a decomposition and v is an index
    17  // into a byte array of UTF-8 decomposition sequences and additional info and
    18  // has the form:
    19  //    <header> <decomp_byte>* [<tccc> [<lccc>]]
    20  // The header contains the number of bytes in the decomposition (excluding this
    21  // length byte). The two most significant bits of this length byte correspond
    22  // to bit 5 and 4 of qcInfo (see below).  The byte sequence itself starts at v+1.
    23  // The byte sequence is followed by a trailing and leading CCC if the values
    24  // for these are not zero.  The value of v determines which ccc are appended
    25  // to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
    27  // there is an additional leading ccc. The value of tccc itself is the
    28  // trailing CCC shifted left 2 bits. The two least-significant bits of tccc
    29  // are the number of trailing non-starters.
    30  
// Bit masks for qcInfo values and for the header byte that precedes each
// decomposition sequence in decomps.
const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
    36  
// Properties provides access to normalization properties of a rune.
// Values are built by compInfo from a trie entry and the rune's UTF-8 size.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // position of the decomposition header in decomps; 0 means no decomposition
}
    47  
// functions dispatchable per form

// lookupFunc returns the Properties for input b at index i.
// NOTE(review): i is presumably the byte offset of the rune within b;
// confirm against the input.charinfo* methods.
type lookupFunc func(b input, i int) Properties
    50  
// formInfo holds Form-specific functions and tables.
// One instance per Form is created by init and stored in formTable.
type formInfo struct {
	form                     Form
	composing, compatibility bool // form type
	info                     lookupFunc // lookupInfoNFKC for compatibility forms, lookupInfoNFC otherwise
	nextMain                 iterFunc   // nextComposed for composing forms, nextDecomposed otherwise
}
    58  
// formTable holds one formInfo per Form, indexed by the Form's numeric value.
// It is populated by init.
var formTable []*formInfo
    60  
    61  func init() {
    62  	formTable = make([]*formInfo, 4)
    63  
    64  	for i := range formTable {
    65  		f := &formInfo{}
    66  		formTable[i] = f
    67  		f.form = Form(i)
    68  		if Form(i) == NFKD || Form(i) == NFKC {
    69  			f.compatibility = true
    70  			f.info = lookupInfoNFKC
    71  		} else {
    72  			f.info = lookupInfoNFC
    73  		}
    74  		f.nextMain = nextDecomposed
    75  		if Form(i) == NFC || Form(i) == NFKC {
    76  			f.nextMain = nextComposed
    77  			f.composing = true
    78  		}
    79  	}
    80  }
    81  
    82  // We do not distinguish between boundaries for NFC, NFD, etc. to avoid
    83  // unexpected behavior for the user.  For example, in NFD, there is a boundary
    84  // after 'a'.  However, 'a' might combine with modifiers, so from the application's
    85  // perspective it is not a good boundary. We will therefore always use the
    86  // boundaries for the combining variants.
    87  
    88  // BoundaryBefore returns true if this rune starts a new segment and
    89  // cannot combine with any rune on the left.
    90  func (p Properties) BoundaryBefore() bool {
    91  	if p.ccc == 0 && !p.combinesBackward() {
    92  		return true
    93  	}
    94  	// We assume that the CCC of the first character in a decomposition
    95  	// is always non-zero if different from info.ccc and that we can return
    96  	// false at this point. This is verified by maketables.
    97  	return false
    98  }
    99  
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
//
// The current implementation is conservative: it reports a boundary only
// when the rune is inert (see isInert).
func (p Properties) BoundaryAfter() bool {
	// TODO: loosen these conditions.
	return p.isInert()
}
   106  
// We pack quick check data in 6 bits:
//   5:    Combines forward  (0 == false, 1 == true)
//   4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//   2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//   1..0: Number of trailing non-starters.
//
// When all 6 bits are zero and the CCC is zero as well, the character is
// inert, meaning it is never influenced by normalization (see isInert).
type qcInfo uint8
   116  
   117  func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
   118  func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
   119  
   120  func (p Properties) combinesForward() bool  { return p.flags&0x20 != 0 }
   121  func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
   122  func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
   123  
   124  func (p Properties) isInert() bool {
   125  	return p.flags&qcInfoMask == 0 && p.ccc == 0
   126  }
   127  
   128  func (p Properties) multiSegment() bool {
   129  	return p.index >= firstMulti && p.index < endMulti
   130  }
   131  
// nLeadingNonStarters returns the stored number of leading non-starters.
func (p Properties) nLeadingNonStarters() uint8 {
	return p.nLead
}
   135  
   136  func (p Properties) nTrailingNonStarters() uint8 {
   137  	return uint8(p.flags & 0x03)
   138  }
   139  
   140  // Decomposition returns the decomposition for the underlying rune
   141  // or nil if there is none.
   142  func (p Properties) Decomposition() []byte {
   143  	// TODO: create the decomposition for Hangul?
   144  	if p.index == 0 {
   145  		return nil
   146  	}
   147  	i := p.index
   148  	n := decomps[i] & headerLenMask
   149  	i++
   150  	return decomps[i : i+uint16(n)]
   151  }
   152  
// Size returns the length of UTF-8 encoding of the rune.
// The value is the size recorded when the Properties was created.
func (p Properties) Size() int {
	return int(p.size)
}
   157  
   158  // CCC returns the canonical combining class of the underlying rune.
   159  func (p Properties) CCC() uint8 {
   160  	if p.index >= firstCCCZeroExcept {
   161  		return 0
   162  	}
   163  	return ccc[p.ccc]
   164  }
   165  
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
func (p Properties) LeadCCC() uint8 {
	// p.ccc is an index into the ccc table rather than the CCC itself.
	return ccc[p.ccc]
}
   171  
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
func (p Properties) TrailCCC() uint8 {
	// p.tccc is an index into the ccc table rather than the CCC itself.
	return ccc[p.tccc]
}
   177  
   178  // Recomposition
   179  // We use 32-bit keys instead of 64-bit for the two codepoint keys.
   180  // This clips off the bits of three entries, but we know this will not
   181  // result in a collision. In the unlikely event that changes to
   182  // UnicodeData.txt introduce collisions, the compiler will catch it.
   183  // Note that the recomposition map for NFC and NFKC are identical.
   184  
   185  // combine returns the combined rune or 0 if it doesn't exist.
   186  func combine(a, b rune) rune {
   187  	key := uint32(uint16(a))<<16 + uint32(uint16(b))
   188  	return recompMap[key]
   189  }
   190  
// lookupInfoNFC returns the Properties for input b at index i, built by
// compInfo from the value and size reported by b.charinfoNFC.
func lookupInfoNFC(b input, i int) Properties {
	v, sz := b.charinfoNFC(i)
	return compInfo(v, sz)
}
   195  
// lookupInfoNFKC returns the Properties for input b at index i, built by
// compInfo from the value and size reported by b.charinfoNFKC.
func lookupInfoNFKC(b input, i int) Properties {
	v, sz := b.charinfoNFKC(i)
	return compInfo(v, sz)
}
   200  
   201  // Properties returns properties for the first rune in s.
   202  func (f Form) Properties(s []byte) Properties {
   203  	if f == NFC || f == NFD {
   204  		return compInfo(nfcData.lookup(s))
   205  	}
   206  	return compInfo(nfkcData.lookup(s))
   207  }
   208  
   209  // PropertiesString returns properties for the first rune in s.
   210  func (f Form) PropertiesString(s string) Properties {
   211  	if f == NFC || f == NFD {
   212  		return compInfo(nfcData.lookupString(s))
   213  	}
   214  	return compInfo(nfkcData.lookupString(s))
   215  }
   216  
   217  // compInfo converts the information contained in v and sz
   218  // to a Properties.  See the comment at the top of the file
   219  // for more information on the format.
   220  func compInfo(v uint16, sz int) Properties {
   221  	if v == 0 {
   222  		return Properties{size: uint8(sz)}
   223  	} else if v >= 0x8000 {
   224  		p := Properties{
   225  			size:  uint8(sz),
   226  			ccc:   uint8(v),
   227  			tccc:  uint8(v),
   228  			flags: qcInfo(v >> 8),
   229  		}
   230  		if p.ccc > 0 || p.combinesBackward() {
   231  			p.nLead = uint8(p.flags & 0x3)
   232  		}
   233  		return p
   234  	}
   235  	// has decomposition
   236  	h := decomps[v]
   237  	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
   238  	p := Properties{size: uint8(sz), flags: f, index: v}
   239  	if v >= firstCCC {
   240  		v += uint16(h&headerLenMask) + 1
   241  		c := decomps[v]
   242  		p.tccc = c >> 2
   243  		p.flags |= qcInfo(c & 0x3)
   244  		if v >= firstLeadingCCC {
   245  			p.nLead = c & 0x3
   246  			if v >= firstStarterWithNLead {
   247  				// We were tricked. Remove the decomposition.
   248  				p.flags &= 0x03
   249  				p.index = 0
   250  				return p
   251  			}
   252  			p.ccc = decomps[v+1]
   253  		}
   254  	}
   255  	return p
   256  }