vitess.io/vitess@v0.16.2/go/mysql/collations/tools/makecolldata/codegen/tablegen.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package codegen
    18  
    19  import (
    20  	"bytes"
    21  	"crypto/sha256"
    22  	"encoding/hex"
    23  	"fmt"
    24  	"log"
    25  	"math/bits"
    26  	"os"
    27  	"reflect"
    28  
    29  	"vitess.io/vitess/go/mysql/collations/internal/uca"
    30  )
    31  
    32  type LiteralPageGenerator struct {
    33  	index map[string]string
    34  }
    35  
    36  func (pg *LiteralPageGenerator) WritePage16(g *Generator, varname string, values []uint16) string {
    37  	hash := hashWeights(values)
    38  	if existing, ok := pg.index[hash]; ok {
    39  		return "&" + existing
    40  	}
    41  
    42  	pg.index[hash] = varname
    43  	g.P("var ", varname, " = []uint16{")
    44  
    45  	for col, w := range values {
    46  		if col > 0 && col%32 == 0 {
    47  			g.WriteByte('\n')
    48  		}
    49  		fmt.Fprintf(g, "0x%04x,", w)
    50  	}
    51  	g.P("}")
    52  	return "&" + varname
    53  }
    54  
    55  func WriteFastPage32(g *Generator, varname string, values []uint32) {
    56  	if len(values) != 256 {
    57  		panic("WritePage32: page does not have 256 values")
    58  	}
    59  	g.P("var fast", varname, " = ", Array32(values))
    60  }
    61  
    62  type EmbedPageGenerator struct {
    63  	index map[string]string
    64  	raw   bytes.Buffer
    65  }
    66  
    67  func hashWeights(values []uint16) string {
    68  	h := sha256.New()
    69  	for _, v := range values {
    70  		h.Write([]byte{byte(v >> 8), byte(v)})
    71  	}
    72  	return hex.EncodeToString(h.Sum(nil))
    73  }
    74  
    75  func (pg *EmbedPageGenerator) WritePage16(g *Generator, varname string, values []uint16) string {
    76  	hash := hashWeights(values)
    77  	if existing, ok := pg.index[hash]; ok {
    78  		return "&" + existing
    79  	}
    80  
    81  	pg.index[hash] = varname
    82  
    83  	g.P("var ", varname, " = weightsUCA_embed(", pg.raw.Len()/2, ", ", len(values), ")")
    84  
    85  	for _, v := range values {
    86  		pg.raw.WriteByte(byte(v))
    87  		pg.raw.WriteByte(byte(v >> 8))
    88  	}
    89  	return "&" + varname
    90  }
    91  
    92  func (pg *EmbedPageGenerator) WriteTrailer(g *Generator, embedfile string) {
    93  	unsafe := Package("unsafe")
    94  	reflect := Package("reflect")
    95  	g.UsePackage("embed")
    96  
    97  	g.P()
    98  	g.P("//go:embed ", embedfile)
    99  	g.P("var weightsUCA_embed_data string")
   100  	g.P()
   101  	g.P("func weightsUCA_embed(pos, length int) []uint16 {")
   102  	g.P("return (*[0x7fff0000]uint16)(", unsafe, ".Pointer((*", reflect, ".StringHeader)(", unsafe, ".Pointer(&weightsUCA_embed_data)).Data))[pos:pos+length]")
   103  	g.P("}")
   104  }
   105  
   106  func (pg *EmbedPageGenerator) WriteToFile(out string) {
   107  	if err := os.WriteFile(out, pg.raw.Bytes(), 0644); err != nil {
   108  		log.Fatal(err)
   109  	}
   110  	log.Printf("written %q (%.02fkb)", out, float64(pg.raw.Len())/1024.0)
   111  }
   112  
   113  type PageGenerator interface {
   114  	WritePage16(g *Generator, varname string, values []uint16) string
   115  }
   116  
   117  func NewPageGenerator(embed bool) PageGenerator {
   118  	index := make(map[string]string)
   119  	if embed {
   120  		return &EmbedPageGenerator{index: index}
   121  	}
   122  	return &LiteralPageGenerator{index: index}
   123  }
   124  
   125  type entry struct {
   126  	weights []uint16
   127  }
   128  
   129  func (e *entry) adjustHangulWeights(tb *TableGenerator, jamos []rune) {
   130  	for _, jamo := range jamos {
   131  		_, entry := tb.entryForCodepoint(jamo)
   132  		e.weights = append(e.weights, entry.weights[0], entry.weights[1], entry.weights[2]+1)
   133  	}
   134  }
   135  
   136  type page struct {
   137  	n          int
   138  	entryCount int
   139  	entries    [uca.CodepointsPerPage]entry
   140  }
   141  
   142  func (p *page) equals(other *page) bool {
   143  	return reflect.DeepEqual(p, other)
   144  }
   145  
   146  func (p *page) name(uca string) string {
   147  	if p.entryCount == 0 {
   148  		panic("cannot name empty page")
   149  	}
   150  	return fmt.Sprintf("weightTable_%s_page%03X", uca, p.n)
   151  }
   152  
   153  func (p *page) findMaxCollationElements() int {
   154  	var weightn int
   155  	for _, entry := range p.entries {
   156  		if len(entry.weights) > weightn {
   157  			weightn = len(entry.weights)
   158  		}
   159  	}
   160  	return weightn
   161  }
   162  
   163  func (p *page) weights900Fast(level int) (w []uint32) {
   164  	if p.entryCount == 0 {
   165  		return nil
   166  	}
   167  	for i := 0; i < 128; i++ {
   168  		entry := &p.entries[i]
   169  		if len(entry.weights) > 3 {
   170  			panic("trying to dump fast weights for codepoint with >3 weights")
   171  		}
   172  		var weight uint32
   173  		if level < len(entry.weights) {
   174  			weight = uint32(bits.ReverseBytes16(entry.weights[level]))
   175  		}
   176  		if weight != 0 {
   177  			weight |= 0x20000
   178  		}
   179  		w = append(w, weight)
   180  	}
   181  	for i := 0; i < 128; i++ {
   182  		w = append(w, 0x0)
   183  	}
   184  	return
   185  }
   186  
   187  func (p *page) weights900() (w []uint16) {
   188  	if p.entryCount == 0 {
   189  		return nil
   190  	}
   191  	maxCollations := p.findMaxCollationElements()
   192  	for _, entry := range p.entries {
   193  		w = append(w, uint16(len(entry.weights)/3))
   194  	}
   195  	for level := 0; level < maxCollations; level++ {
   196  		for _, entry := range p.entries {
   197  			var weight uint16
   198  			if level < len(entry.weights) {
   199  				weight = entry.weights[level]
   200  			}
   201  			w = append(w, weight)
   202  		}
   203  	}
   204  	return
   205  }
   206  
   207  func (p *page) weightsLegacy() (w []uint16) {
   208  	if p.entryCount == 0 {
   209  		return nil
   210  	}
   211  	stride := p.findMaxCollationElements()
   212  	w = append(w, uint16(stride))
   213  	for _, entry := range p.entries {
   214  		var i int
   215  		for i < len(entry.weights) {
   216  			w = append(w, entry.weights[i])
   217  			i++
   218  		}
   219  		for i < stride {
   220  			w = append(w, 0x0)
   221  			i++
   222  		}
   223  	}
   224  	return
   225  }
   226  
   227  type TableGenerator struct {
   228  	pages   []page
   229  	maxChar rune
   230  	ucav    string
   231  	pg      PageGenerator
   232  }
   233  
   234  func (tg *TableGenerator) entryForCodepoint(codepoint rune) (*page, *entry) {
   235  	page := &tg.pages[int(codepoint)/uca.CodepointsPerPage]
   236  	entry := &page.entries[int(codepoint)%uca.CodepointsPerPage]
   237  	return page, entry
   238  }
   239  
   240  func (tg *TableGenerator) Add900(codepoint rune, rhs [][3]uint16) {
   241  	page, entry := tg.entryForCodepoint(codepoint)
   242  	page.entryCount++
   243  
   244  	for i, weights := range rhs {
   245  		if i >= uca.MaxCollationElementsPerCodepoint {
   246  			break
   247  		}
   248  		for _, we := range weights {
   249  			entry.weights = append(entry.weights, we)
   250  		}
   251  	}
   252  }
   253  
   254  func (tg *TableGenerator) Add(codepoint rune, weights []uint16) {
   255  	page, entry := tg.entryForCodepoint(codepoint)
   256  	page.entryCount++
   257  
   258  	if entry.weights != nil {
   259  		panic("duplicate codepoint inserted")
   260  	}
   261  	entry.weights = append(entry.weights, weights...)
   262  }
   263  
   264  func (tg *TableGenerator) AddFromAllkeys(lhs []rune, rhs [][]int, vars []int) {
   265  	if len(lhs) > 1 || lhs[0] > tg.maxChar {
   266  		// TODO: support contractions
   267  		return
   268  	}
   269  
   270  	var weights [][3]uint16
   271  	for _, we := range rhs {
   272  		if len(we) != 3 {
   273  			panic("non-triplet weight in allkeys.txt")
   274  		}
   275  		weights = append(weights, [3]uint16{uint16(we[0]), uint16(we[1]), uint16(we[2])})
   276  	}
   277  	tg.Add900(lhs[0], weights)
   278  }
   279  
   280  func (tg *TableGenerator) writePage(g *Generator, p *page, layout uca.Layout) string {
   281  	var weights []uint16
   282  
   283  	switch layout.(type) {
   284  	case uca.Layout_uca900:
   285  		weights = p.weights900()
   286  	case uca.Layout_uca_legacy:
   287  		weights = p.weightsLegacy()
   288  	}
   289  
   290  	if len(weights) == 0 {
   291  		return "nil"
   292  	}
   293  	return tg.pg.WritePage16(g, p.name(tg.ucav), weights)
   294  }
   295  
   296  func (tg *TableGenerator) WriteTables(g *Generator, layout uca.Layout) {
   297  	var pagePtrs []string
   298  	for _, page := range tg.pages {
   299  		pagePtrs = append(pagePtrs, tg.writePage(g, &page, layout))
   300  	}
   301  
   302  	g.P("var weightTable_", tg.ucav, " = []*[]uint16{")
   303  	for col, pageptr := range pagePtrs {
   304  		if col > 0 && col%32 == 0 {
   305  			g.WriteByte('\n')
   306  		}
   307  		g.WriteString(pageptr)
   308  		g.WriteByte(',')
   309  	}
   310  	g.P("}")
   311  }
   312  
   313  func (tg *TableGenerator) WriteFastTables(g *Generator, layout uca.Layout) {
   314  	switch layout.(type) {
   315  	case uca.Layout_uca900:
   316  	default:
   317  		panic("unsupported table layout for FastTables")
   318  	}
   319  
   320  	ascii := &tg.pages[0]
   321  	WriteFastPage32(g, ascii.name(tg.ucav)+"L0", ascii.weights900Fast(0))
   322  	WriteFastPage32(g, ascii.name(tg.ucav)+"L1", ascii.weights900Fast(1))
   323  	WriteFastPage32(g, ascii.name(tg.ucav)+"L2", ascii.weights900Fast(2))
   324  }
   325  
   326  func NewTableGenerator(ucav string, pagebuilder PageGenerator) *TableGenerator {
   327  	var maxChar rune
   328  	switch ucav {
   329  	case "uca520", "uca900", "uca900_zh", "uca900_ja":
   330  		maxChar = uca.MaxCodepoint
   331  	case "uca400":
   332  		maxChar = 0xFFFF + 1
   333  	default:
   334  		panic("unknown UCA version")
   335  	}
   336  
   337  	tb := &TableGenerator{
   338  		pages:   make([]page, maxChar/uca.CodepointsPerPage),
   339  		maxChar: maxChar,
   340  		ucav:    ucav,
   341  		pg:      pagebuilder,
   342  	}
   343  
   344  	for n := range tb.pages {
   345  		tb.pages[n].n = n
   346  	}
   347  
   348  	return tb
   349  }