vitess.io/vitess@v0.16.2/go/mysql/collations/tools/makecolldata/codegen/tablegen.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package codegen 18 19 import ( 20 "bytes" 21 "crypto/sha256" 22 "encoding/hex" 23 "fmt" 24 "log" 25 "math/bits" 26 "os" 27 "reflect" 28 29 "vitess.io/vitess/go/mysql/collations/internal/uca" 30 ) 31 32 type LiteralPageGenerator struct { 33 index map[string]string 34 } 35 36 func (pg *LiteralPageGenerator) WritePage16(g *Generator, varname string, values []uint16) string { 37 hash := hashWeights(values) 38 if existing, ok := pg.index[hash]; ok { 39 return "&" + existing 40 } 41 42 pg.index[hash] = varname 43 g.P("var ", varname, " = []uint16{") 44 45 for col, w := range values { 46 if col > 0 && col%32 == 0 { 47 g.WriteByte('\n') 48 } 49 fmt.Fprintf(g, "0x%04x,", w) 50 } 51 g.P("}") 52 return "&" + varname 53 } 54 55 func WriteFastPage32(g *Generator, varname string, values []uint32) { 56 if len(values) != 256 { 57 panic("WritePage32: page does not have 256 values") 58 } 59 g.P("var fast", varname, " = ", Array32(values)) 60 } 61 62 type EmbedPageGenerator struct { 63 index map[string]string 64 raw bytes.Buffer 65 } 66 67 func hashWeights(values []uint16) string { 68 h := sha256.New() 69 for _, v := range values { 70 h.Write([]byte{byte(v >> 8), byte(v)}) 71 } 72 return hex.EncodeToString(h.Sum(nil)) 73 } 74 75 func (pg *EmbedPageGenerator) WritePage16(g *Generator, varname string, values []uint16) string { 76 hash := hashWeights(values) 77 if existing, ok := pg.index[hash]; ok { 78 return "&" + existing 79 } 80 81 pg.index[hash] = varname 82 83 g.P("var ", varname, " = weightsUCA_embed(", pg.raw.Len()/2, ", ", len(values), ")") 84 85 for _, v := range values { 86 pg.raw.WriteByte(byte(v)) 87 pg.raw.WriteByte(byte(v >> 8)) 88 } 89 return "&" + varname 90 } 91 92 func (pg *EmbedPageGenerator) WriteTrailer(g *Generator, embedfile string) { 93 unsafe := Package("unsafe") 94 reflect := Package("reflect") 95 g.UsePackage("embed") 96 97 g.P() 98 g.P("//go:embed ", embedfile) 99 g.P("var weightsUCA_embed_data string") 100 g.P() 101 g.P("func weightsUCA_embed(pos, length int) []uint16 {") 102 g.P("return (*[0x7fff0000]uint16)(", unsafe, ".Pointer((*", reflect, ".StringHeader)(", unsafe, ".Pointer(&weightsUCA_embed_data)).Data))[pos:pos+length]") 103 g.P("}") 104 } 105 106 func (pg *EmbedPageGenerator) WriteToFile(out string) { 107 if err := os.WriteFile(out, pg.raw.Bytes(), 0644); err != nil { 108 log.Fatal(err) 109 } 110 log.Printf("written %q (%.02fkb)", out, float64(pg.raw.Len())/1024.0) 111 } 112 113 type PageGenerator interface { 114 WritePage16(g *Generator, varname string, values []uint16) string 115 } 116 117 func NewPageGenerator(embed bool) PageGenerator { 118 index := make(map[string]string) 119 if embed { 120 return &EmbedPageGenerator{index: index} 121 } 122 return &LiteralPageGenerator{index: index} 123 } 124 125 type entry struct { 126 weights []uint16 127 } 128 129 func (e *entry) adjustHangulWeights(tb *TableGenerator, jamos []rune) { 130 for _, jamo := range jamos { 131 _, entry := tb.entryForCodepoint(jamo) 132 e.weights = append(e.weights, entry.weights[0], entry.weights[1], entry.weights[2]+1) 133 } 134 } 135 136 type page struct { 137 n int 138 entryCount int 139 entries [uca.CodepointsPerPage]entry 140 } 141 142 func (p *page) equals(other *page) bool { 143 return reflect.DeepEqual(p, other) 144 } 145 146 func (p *page) name(uca string) string { 147 if p.entryCount == 0 { 148 panic("cannot name empty page") 149 } 150 return fmt.Sprintf("weightTable_%s_page%03X", uca, p.n) 151 } 152 153 func (p *page) findMaxCollationElements() int { 154 var weightn int 155 for _, entry := range p.entries { 156 if len(entry.weights) > weightn { 157 weightn = len(entry.weights) 158 } 159 } 160 return weightn 161 } 162 163 func (p *page) weights900Fast(level int) (w []uint32) { 164 if p.entryCount == 0 { 165 return nil 166 } 167 for i := 0; i < 128; i++ { 168 entry := &p.entries[i] 169 if len(entry.weights) > 3 { 170 panic("trying to dump fast weights for codepoint with >3 weights") 171 } 172 var weight uint32 173 if level < len(entry.weights) { 174 weight = uint32(bits.ReverseBytes16(entry.weights[level])) 175 } 176 if weight != 0 { 177 weight |= 0x20000 178 } 179 w = append(w, weight) 180 } 181 for i := 0; i < 128; i++ { 182 w = append(w, 0x0) 183 } 184 return 185 } 186 187 func (p *page) weights900() (w []uint16) { 188 if p.entryCount == 0 { 189 return nil 190 } 191 maxCollations := p.findMaxCollationElements() 192 for _, entry := range p.entries { 193 w = append(w, uint16(len(entry.weights)/3)) 194 } 195 for level := 0; level < maxCollations; level++ { 196 for _, entry := range p.entries { 197 var weight uint16 198 if level < len(entry.weights) { 199 weight = entry.weights[level] 200 } 201 w = append(w, weight) 202 } 203 } 204 return 205 } 206 207 func (p *page) weightsLegacy() (w []uint16) { 208 if p.entryCount == 0 { 209 return nil 210 } 211 stride := p.findMaxCollationElements() 212 w = append(w, uint16(stride)) 213 for _, entry := range p.entries { 214 var i int 215 for i < len(entry.weights) { 216 w = append(w, entry.weights[i]) 217 i++ 218 } 219 for i < stride { 220 w = append(w, 0x0) 221 i++ 222 } 223 } 224 return 225 } 226 227 type TableGenerator struct { 228 pages []page 229 maxChar rune 230 ucav string 231 pg PageGenerator 232 } 233 234 func (tg *TableGenerator) entryForCodepoint(codepoint rune) (*page, *entry) { 235 page := &tg.pages[int(codepoint)/uca.CodepointsPerPage] 236 entry := &page.entries[int(codepoint)%uca.CodepointsPerPage] 237 return page, entry 238 } 239 240 func (tg *TableGenerator) Add900(codepoint rune, rhs [][3]uint16) { 241 page, entry := tg.entryForCodepoint(codepoint) 242 page.entryCount++ 243 244 for i, weights := range rhs { 245 if i >= uca.MaxCollationElementsPerCodepoint { 246 break 247 } 248 for _, we := range weights { 249 entry.weights = append(entry.weights, we) 250 } 251 } 252 } 253 254 func (tg *TableGenerator) Add(codepoint rune, weights []uint16) { 255 page, entry := tg.entryForCodepoint(codepoint) 256 page.entryCount++ 257 258 if entry.weights != nil { 259 panic("duplicate codepoint inserted") 260 } 261 entry.weights = append(entry.weights, weights...) 262 } 263 264 func (tg *TableGenerator) AddFromAllkeys(lhs []rune, rhs [][]int, vars []int) { 265 if len(lhs) > 1 || lhs[0] > tg.maxChar { 266 // TODO: support contractions 267 return 268 } 269 270 var weights [][3]uint16 271 for _, we := range rhs { 272 if len(we) != 3 { 273 panic("non-triplet weight in allkeys.txt") 274 } 275 weights = append(weights, [3]uint16{uint16(we[0]), uint16(we[1]), uint16(we[2])}) 276 } 277 tg.Add900(lhs[0], weights) 278 } 279 280 func (tg *TableGenerator) writePage(g *Generator, p *page, layout uca.Layout) string { 281 var weights []uint16 282 283 switch layout.(type) { 284 case uca.Layout_uca900: 285 weights = p.weights900() 286 case uca.Layout_uca_legacy: 287 weights = p.weightsLegacy() 288 } 289 290 if len(weights) == 0 { 291 return "nil" 292 } 293 return tg.pg.WritePage16(g, p.name(tg.ucav), weights) 294 } 295 296 func (tg *TableGenerator) WriteTables(g *Generator, layout uca.Layout) { 297 var pagePtrs []string 298 for _, page := range tg.pages { 299 pagePtrs = append(pagePtrs, tg.writePage(g, &page, layout)) 300 } 301 302 g.P("var weightTable_", tg.ucav, " = []*[]uint16{") 303 for col, pageptr := range pagePtrs { 304 if col > 0 && col%32 == 0 { 305 g.WriteByte('\n') 306 } 307 g.WriteString(pageptr) 308 g.WriteByte(',') 309 } 310 g.P("}") 311 } 312 313 func (tg *TableGenerator) WriteFastTables(g *Generator, layout uca.Layout) { 314 switch layout.(type) { 315 case uca.Layout_uca900: 316 default: 317 panic("unsupported table layout for FastTables") 318 } 319 320 ascii := &tg.pages[0] 321 WriteFastPage32(g, ascii.name(tg.ucav)+"L0", ascii.weights900Fast(0)) 322 WriteFastPage32(g, ascii.name(tg.ucav)+"L1", ascii.weights900Fast(1)) 323 WriteFastPage32(g, ascii.name(tg.ucav)+"L2", ascii.weights900Fast(2)) 324 } 325 326 func NewTableGenerator(ucav string, pagebuilder PageGenerator) *TableGenerator { 327 var maxChar rune 328 switch ucav { 329 case "uca520", "uca900", "uca900_zh", "uca900_ja": 330 maxChar = uca.MaxCodepoint 331 case "uca400": 332 maxChar = 0xFFFF + 1 333 default: 334 panic("unknown UCA version") 335 } 336 337 tb := &TableGenerator{ 338 pages: make([]page, maxChar/uca.CodepointsPerPage), 339 maxChar: maxChar, 340 ucav: ucav, 341 pg: pagebuilder, 342 } 343 344 for n := range tb.pages { 345 tb.pages[n].n = n 346 } 347 348 return tb 349 }