github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/export/idna/gen.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  // +build ignore
     7  
     8  // This program generates the trie for idna operations. IDNA processing
     9  // requires the lookup of various properties and mappings for each rune. The
    10  // table generated by this generator combines several of the most frequently
    11  // used of these into a single trie so that they can be accessed with a single
    12  // lookup.
    13  package main
    14  
    15  import (
    16  	"fmt"
    17  	"io"
    18  	"log"
    19  	"unicode"
    20  	"unicode/utf8"
    21  
    22  	"github.com/go-enjin/golang-org-x-text/internal/gen"
    23  	"github.com/go-enjin/golang-org-x-text/internal/triegen"
    24  	"github.com/go-enjin/golang-org-x-text/internal/ucd"
    25  	"github.com/go-enjin/golang-org-x-text/unicode/bidi"
    26  )
    27  
    28  func main() {
    29  	gen.Init()
    30  	genTables()
    31  	gen.Repackage("gen_trieval.go", "trieval.go", "idna")
    32  	gen.Repackage("gen_common.go", "common_test.go", "idna")
    33  }
    34  
    35  var runes = map[rune]info{}
    36  
    37  func genTables() {
    38  	t := triegen.NewTrie("idna")
    39  
    40  	ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) {
    41  		r := p.Rune(0)
    42  		if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M"
    43  			runes[r] = mayNeedNorm
    44  		}
    45  	})
    46  	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
    47  		r := p.Rune(0)
    48  
    49  		const cccVirama = 9
    50  		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
    51  			runes[p.Rune(0)] = viramaModifier
    52  		}
    53  		switch {
    54  		case unicode.In(r, unicode.Mark):
    55  			runes[r] |= modifier | mayNeedNorm
    56  		}
    57  		// TODO: by using UnicodeData.txt we don't mark undefined codepoints
    58  		// that are earmarked as RTL properly. However, an undefined cp will
    59  		// always fail, so there is no need to store this info.
    60  		switch p, _ := bidi.LookupRune(r); p.Class() {
    61  		case bidi.R, bidi.AL, bidi.AN:
    62  			if x := runes[r]; x != 0 && x != mayNeedNorm {
    63  				log.Fatalf("%U: rune both modifier and RTL letter/number", r)
    64  			}
    65  			runes[r] = rtl
    66  		}
    67  	})
    68  
    69  	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
    70  		switch v := p.String(1); v {
    71  		case "L", "D", "T", "R":
    72  			runes[p.Rune(0)] |= joinType[v] << joinShift
    73  		}
    74  	})
    75  
    76  	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
    77  		r := p.Rune(0)
    78  
    79  		// The mappings table explicitly defines surrogates as invalid.
    80  		if !utf8.ValidRune(r) {
    81  			return
    82  		}
    83  
    84  		cat := catFromEntry(p)
    85  		isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation
    86  		if !isMapped {
    87  			// Only include additional category information for non-mapped
    88  			// runes. The additional information is only used after mapping and
    89  			// the bits would clash with mapping information.
    90  			// TODO: it would be possible to inline this data and avoid
    91  			// additional lookups. This is quite tedious, though, so let's first
    92  			// see if we need this.
    93  			cat |= category(runes[r])
    94  		}
    95  
    96  		s := string(p.Runes(2))
    97  		if s != "" && !isMapped {
    98  			log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
    99  		}
   100  		t.Insert(r, uint64(makeEntry(r, s))+uint64(cat))
   101  	})
   102  
   103  	w := gen.NewCodeWriter()
   104  	defer w.WriteVersionedGoFile("tables.go", "idna")
   105  
   106  	gen.WriteUnicodeVersion(w)
   107  
   108  	w.WriteVar("mappings", string(mappings))
   109  	w.WriteVar("mappingIndex", mappingIndex)
   110  	w.WriteVar("xorData", string(xorData))
   111  
   112  	sz, err := t.Gen(w, triegen.Compact(&normCompacter{}))
   113  	if err != nil {
   114  		log.Fatal(err)
   115  	}
   116  	w.Size += sz
   117  }
   118  
   119  var (
   120  	// mappings contains replacement strings for mapped runes.
   121  	mappings = []byte{}
   122  
   123  	// mappingIndex contains an offset in mappingBytes representing the start
   124  	// of a mapping. Then next entry in mappingIndex points past the end of the
   125  	// string.
   126  	mappingIndex = []uint16{0}
   127  	mapCache     = map[string]int{}
   128  
   129  	// xorData is like mappings, except that it contains XOR data.
   130  	// We split these two tables so that we don't get an overflow.
   131  	xorData  = []byte{}
   132  	xorCache = map[string]int{}
   133  )
   134  
   135  // makeEntry creates a trie entry.
   136  func makeEntry(r rune, mapped string) info {
   137  	orig := string(r)
   138  
   139  	if len(orig) != len(mapped) {
   140  		// Store the mapped value as is in the mappings table.
   141  		index := len(mappingIndex) - 1
   142  		if x, ok := mapCache[mapped]; ok {
   143  			index = x
   144  		} else {
   145  			mapCache[mapped] = index
   146  			mappings = append(mappings, mapped...)
   147  			mappingIndex = append(mappingIndex, uint16(len(mappings)))
   148  		}
   149  		return info(index) << indexShift
   150  	}
   151  
   152  	// Create per-byte XOR mask.
   153  	var b []byte
   154  	for i := 0; i < len(orig); i++ {
   155  		b = append(b, orig[i]^mapped[i])
   156  	}
   157  
   158  	// Remove leading 0 bytes, but keep at least one byte.
   159  	for ; len(b) > 1 && b[0] == 0; b = b[1:] {
   160  	}
   161  
   162  	if len(b) == 1 {
   163  		return xorBit | inlineXOR | info(b[0])<<indexShift
   164  	}
   165  	mapped = string(b)
   166  
   167  	// Store the mapped value as is in the mappings table.
   168  	index := len(xorData)
   169  	if x, ok := xorCache[mapped]; ok {
   170  		index = x
   171  	} else {
   172  		xorCache[mapped] = index
   173  		xorData = append(xorData, byte(len(mapped)))
   174  		xorData = append(xorData, mapped...)
   175  	}
   176  	return xorBit | info(index)<<indexShift
   177  }
   178  
   179  // The following code implements a triegen.Compacter that was originally
   180  // designed for normalization. The IDNA table has some similarities with the
   181  // norm table. Using this compacter, together with the XOR pattern approach,
   182  // reduces the table size by roughly 100K. It can probably be compressed further
   183  // by also including elements of the compacter used by cases, but for now it is
   184  // good enough.
   185  
   186  const maxSparseEntries = 16
   187  
   188  type normCompacter struct {
   189  	sparseBlocks [][]uint64
   190  	sparseOffset []uint16
   191  	sparseCount  int
   192  }
   193  
   194  func mostFrequentStride(a []uint64) int {
   195  	counts := make(map[int]int)
   196  	var v int
   197  	for _, x := range a {
   198  		if stride := int(x) - v; v != 0 && stride >= 0 {
   199  			counts[stride]++
   200  		}
   201  		v = int(x)
   202  	}
   203  	var maxs, maxc int
   204  	for stride, cnt := range counts {
   205  		if cnt > maxc || (cnt == maxc && stride < maxs) {
   206  			maxs, maxc = stride, cnt
   207  		}
   208  	}
   209  	return maxs
   210  }
   211  
   212  func countSparseEntries(a []uint64) int {
   213  	stride := mostFrequentStride(a)
   214  	var v, count int
   215  	for _, tv := range a {
   216  		if int(tv)-v != stride {
   217  			if tv != 0 {
   218  				count++
   219  			}
   220  		}
   221  		v = int(tv)
   222  	}
   223  	return count
   224  }
   225  
   226  func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
   227  	if n := countSparseEntries(v); n <= maxSparseEntries {
   228  		return (n+1)*4 + 2, true
   229  	}
   230  	return 0, false
   231  }
   232  
   233  func (c *normCompacter) Store(v []uint64) uint32 {
   234  	h := uint32(len(c.sparseOffset))
   235  	c.sparseBlocks = append(c.sparseBlocks, v)
   236  	c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
   237  	c.sparseCount += countSparseEntries(v) + 1
   238  	return h
   239  }
   240  
   241  func (c *normCompacter) Handler() string {
   242  	return "idnaSparse.lookup"
   243  }
   244  
   245  func (c *normCompacter) Print(w io.Writer) (retErr error) {
   246  	p := func(f string, x ...interface{}) {
   247  		if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
   248  			retErr = err
   249  		}
   250  	}
   251  
   252  	ls := len(c.sparseBlocks)
   253  	p("// idnaSparseOffset: %d entries, %d bytes\n", ls, ls*2)
   254  	p("var idnaSparseOffset = %#v\n\n", c.sparseOffset)
   255  
   256  	ns := c.sparseCount
   257  	p("// idnaSparseValues: %d entries, %d bytes\n", ns, ns*4)
   258  	p("var idnaSparseValues = [%d]valueRange {", ns)
   259  	for i, b := range c.sparseBlocks {
   260  		p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
   261  		var v int
   262  		stride := mostFrequentStride(b)
   263  		n := countSparseEntries(b)
   264  		p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
   265  		for i, nv := range b {
   266  			if int(nv)-v != stride {
   267  				if v != 0 {
   268  					p(",hi:%#02x},", 0x80+i-1)
   269  				}
   270  				if nv != 0 {
   271  					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
   272  				}
   273  			}
   274  			v = int(nv)
   275  		}
   276  		if v != 0 {
   277  			p(",hi:%#02x},", 0x80+len(b)-1)
   278  		}
   279  	}
   280  	p("\n}\n\n")
   281  	return
   282  }