github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/colltab/table.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package colltab 6 7 import ( 8 "unicode/utf8" 9 10 "github.com/go-xe2/third/golang.org/x/text/unicode/norm" 11 ) 12 13 // Table holds all collation data for a given collation ordering. 14 type Table struct { 15 Index Trie // main trie 16 17 // expansion info 18 ExpandElem []uint32 19 20 // contraction info 21 ContractTries ContractTrieSet 22 ContractElem []uint32 23 MaxContractLen int 24 VariableTop uint32 25 } 26 27 func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) { 28 return t.appendNext(w, source{bytes: b}) 29 } 30 31 func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) { 32 return t.appendNext(w, source{str: s}) 33 } 34 35 func (t *Table) Start(p int, b []byte) int { 36 // TODO: implement 37 panic("not implemented") 38 } 39 40 func (t *Table) StartString(p int, s string) int { 41 // TODO: implement 42 panic("not implemented") 43 } 44 45 func (t *Table) Domain() []string { 46 // TODO: implement 47 panic("not implemented") 48 } 49 50 func (t *Table) Top() uint32 { 51 return t.VariableTop 52 } 53 54 type source struct { 55 str string 56 bytes []byte 57 } 58 59 func (src *source) lookup(t *Table) (ce Elem, sz int) { 60 if src.bytes == nil { 61 return t.Index.lookupString(src.str) 62 } 63 return t.Index.lookup(src.bytes) 64 } 65 66 func (src *source) tail(sz int) { 67 if src.bytes == nil { 68 src.str = src.str[sz:] 69 } else { 70 src.bytes = src.bytes[sz:] 71 } 72 } 73 74 func (src *source) nfd(buf []byte, end int) []byte { 75 if src.bytes == nil { 76 return norm.NFD.AppendString(buf[:0], src.str[:end]) 77 } 78 return norm.NFD.Append(buf[:0], src.bytes[:end]...) 79 } 80 81 func (src *source) rune() (r rune, sz int) { 82 if src.bytes == nil { 83 return utf8.DecodeRuneInString(src.str) 84 } 85 return utf8.DecodeRune(src.bytes) 86 } 87 88 func (src *source) properties(f norm.Form) norm.Properties { 89 if src.bytes == nil { 90 return f.PropertiesString(src.str) 91 } 92 return f.Properties(src.bytes) 93 } 94 95 // appendNext appends the weights corresponding to the next rune or 96 // contraction in s. If a contraction is matched to a discontinuous 97 // sequence of runes, the weights for the interstitial runes are 98 // appended as well. It returns a new slice that includes the appended 99 // weights and the number of bytes consumed from s. 100 func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) { 101 ce, sz := src.lookup(t) 102 tp := ce.ctype() 103 if tp == ceNormal { 104 if ce == 0 { 105 r, _ := src.rune() 106 const ( 107 hangulSize = 3 108 firstHangul = 0xAC00 109 lastHangul = 0xD7A3 110 ) 111 if r >= firstHangul && r <= lastHangul { 112 // TODO: performance can be considerably improved here. 113 n = sz 114 var buf [16]byte // Used for decomposing Hangul. 115 for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] { 116 ce, sz = t.Index.lookup(b) 117 w = append(w, ce) 118 } 119 return w, n 120 } 121 ce = makeImplicitCE(implicitPrimary(r)) 122 } 123 w = append(w, ce) 124 } else if tp == ceExpansionIndex { 125 w = t.appendExpansion(w, ce) 126 } else if tp == ceContractionIndex { 127 n := 0 128 src.tail(sz) 129 if src.bytes == nil { 130 w, n = t.matchContractionString(w, ce, src.str) 131 } else { 132 w, n = t.matchContraction(w, ce, src.bytes) 133 } 134 sz += n 135 } else if tp == ceDecompose { 136 // Decompose using NFKD and replace tertiary weights. 137 t1, t2 := splitDecompose(ce) 138 i := len(w) 139 nfkd := src.properties(norm.NFKD).Decomposition() 140 for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] { 141 w, p = t.appendNext(w, source{bytes: nfkd}) 142 } 143 w[i] = w[i].updateTertiary(t1) 144 if i++; i < len(w) { 145 w[i] = w[i].updateTertiary(t2) 146 for i++; i < len(w); i++ { 147 w[i] = w[i].updateTertiary(maxTertiary) 148 } 149 } 150 } 151 return w, sz 152 } 153 154 func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem { 155 i := splitExpandIndex(ce) 156 n := int(t.ExpandElem[i]) 157 i++ 158 for _, ce := range t.ExpandElem[i : i+n] { 159 w = append(w, Elem(ce)) 160 } 161 return w 162 } 163 164 func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) { 165 index, n, offset := splitContractIndex(ce) 166 167 scan := t.ContractTries.scanner(index, n, suffix) 168 buf := [norm.MaxSegmentSize]byte{} 169 bufp := 0 170 p := scan.scan(0) 171 172 if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf { 173 // By now we should have filtered most cases. 174 p0 := p 175 bufn := 0 176 rune := norm.NFD.Properties(suffix[p:]) 177 p += rune.Size() 178 if rune.LeadCCC() != 0 { 179 prevCC := rune.TrailCCC() 180 // A gap may only occur in the last normalization segment. 181 // This also ensures that len(scan.s) < norm.MaxSegmentSize. 182 if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 { 183 scan.s = suffix[:p+end] 184 } 185 for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf { 186 rune = norm.NFD.Properties(suffix[p:]) 187 if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc { 188 break 189 } 190 prevCC = rune.TrailCCC() 191 if pp := scan.scan(p); pp != p { 192 // Copy the interstitial runes for later processing. 193 bufn += copy(buf[bufn:], suffix[p0:p]) 194 if scan.pindex == pp { 195 bufp = bufn 196 } 197 p, p0 = pp, pp 198 } else { 199 p += rune.Size() 200 } 201 } 202 } 203 } 204 // Append weights for the matched contraction, which may be an expansion. 205 i, n := scan.result() 206 ce = Elem(t.ContractElem[i+offset]) 207 if ce.ctype() == ceNormal { 208 w = append(w, ce) 209 } else { 210 w = t.appendExpansion(w, ce) 211 } 212 // Append weights for the runes in the segment not part of the contraction. 213 for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] { 214 w, p = t.appendNext(w, source{bytes: b}) 215 } 216 return w, n 217 } 218 219 // TODO: unify the two implementations. This is best done after first simplifying 220 // the algorithm taking into account the inclusion of both NFC and NFD forms 221 // in the table. 222 func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) { 223 index, n, offset := splitContractIndex(ce) 224 225 scan := t.ContractTries.scannerString(index, n, suffix) 226 buf := [norm.MaxSegmentSize]byte{} 227 bufp := 0 228 p := scan.scan(0) 229 230 if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf { 231 // By now we should have filtered most cases. 232 p0 := p 233 bufn := 0 234 rune := norm.NFD.PropertiesString(suffix[p:]) 235 p += rune.Size() 236 if rune.LeadCCC() != 0 { 237 prevCC := rune.TrailCCC() 238 // A gap may only occur in the last normalization segment. 239 // This also ensures that len(scan.s) < norm.MaxSegmentSize. 240 if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 { 241 scan.s = suffix[:p+end] 242 } 243 for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf { 244 rune = norm.NFD.PropertiesString(suffix[p:]) 245 if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc { 246 break 247 } 248 prevCC = rune.TrailCCC() 249 if pp := scan.scan(p); pp != p { 250 // Copy the interstitial runes for later processing. 251 bufn += copy(buf[bufn:], suffix[p0:p]) 252 if scan.pindex == pp { 253 bufp = bufn 254 } 255 p, p0 = pp, pp 256 } else { 257 p += rune.Size() 258 } 259 } 260 } 261 } 262 // Append weights for the matched contraction, which may be an expansion. 263 i, n := scan.result() 264 ce = Elem(t.ContractElem[i+offset]) 265 if ce.ctype() == ceNormal { 266 w = append(w, ce) 267 } else { 268 w = t.appendExpansion(w, ce) 269 } 270 // Append weights for the runes in the segment not part of the contraction. 271 for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] { 272 w, p = t.appendNext(w, source{bytes: b}) 273 } 274 return w, n 275 }