github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/export/idna/gen.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
// This program generates the trie for idna operations. IDNA processing
// requires the lookup of various properties and mappings for each rune. The
// table generated by this generator combines several of the most frequently
// used of these into a single trie so that they can be accessed with a single
// lookup.
    12  package main
    13  
    14  import (
    15  	"fmt"
    16  	"io"
    17  	"log"
    18  	"unicode"
    19  	"unicode/utf8"
    20  
    21  	"github.com/go-xe2/third/golang.org/x/text/internal/gen"
    22  	"github.com/go-xe2/third/golang.org/x/text/internal/triegen"
    23  	"github.com/go-xe2/third/golang.org/x/text/internal/ucd"
    24  	"github.com/go-xe2/third/golang.org/x/text/unicode/bidi"
    25  )
    26  
    27  func main() {
    28  	gen.Init()
    29  	genTables()
    30  	gen.Repackage("gen_trieval.go", "trieval.go", "idna")
    31  	gen.Repackage("gen_common.go", "common_test.go", "idna")
    32  }
    33  
    34  var runes = map[rune]info{}
    35  
    36  func genTables() {
    37  	t := triegen.NewTrie("idna")
    38  
    39  	ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) {
    40  		r := p.Rune(0)
    41  		if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M"
    42  			runes[r] = mayNeedNorm
    43  		}
    44  	})
    45  	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
    46  		r := p.Rune(0)
    47  
    48  		const cccVirama = 9
    49  		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
    50  			runes[p.Rune(0)] = viramaModifier
    51  		}
    52  		switch {
    53  		case unicode.In(r, unicode.Mark):
    54  			runes[r] |= modifier | mayNeedNorm
    55  		}
    56  		// TODO: by using UnicodeData.txt we don't mark undefined codepoints
    57  		// that are earmarked as RTL properly. However, an undefined cp will
    58  		// always fail, so there is no need to store this info.
    59  		switch p, _ := bidi.LookupRune(r); p.Class() {
    60  		case bidi.R, bidi.AL, bidi.AN:
    61  			if x := runes[r]; x != 0 && x != mayNeedNorm {
    62  				log.Fatalf("%U: rune both modifier and RTL letter/number", r)
    63  			}
    64  			runes[r] = rtl
    65  		}
    66  	})
    67  
    68  	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
    69  		switch v := p.String(1); v {
    70  		case "L", "D", "T", "R":
    71  			runes[p.Rune(0)] |= joinType[v] << joinShift
    72  		}
    73  	})
    74  
    75  	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
    76  		r := p.Rune(0)
    77  
    78  		// The mappings table explicitly defines surrogates as invalid.
    79  		if !utf8.ValidRune(r) {
    80  			return
    81  		}
    82  
    83  		cat := catFromEntry(p)
    84  		isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation
    85  		if !isMapped {
    86  			// Only include additional category information for non-mapped
    87  			// runes. The additional information is only used after mapping and
    88  			// the bits would clash with mapping information.
    89  			// TODO: it would be possible to inline this data and avoid
    90  			// additional lookups. This is quite tedious, though, so let's first
    91  			// see if we need this.
    92  			cat |= category(runes[r])
    93  		}
    94  
    95  		s := string(p.Runes(2))
    96  		if s != "" && !isMapped {
    97  			log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
    98  		}
    99  		t.Insert(r, uint64(makeEntry(r, s))+uint64(cat))
   100  	})
   101  
   102  	w := gen.NewCodeWriter()
   103  	defer w.WriteGoFile("tables.go", "idna")
   104  
   105  	gen.WriteUnicodeVersion(w)
   106  
   107  	w.WriteVar("mappings", string(mappings))
   108  	w.WriteVar("xorData", string(xorData))
   109  
   110  	sz, err := t.Gen(w, triegen.Compact(&normCompacter{}))
   111  	if err != nil {
   112  		log.Fatal(err)
   113  	}
   114  	w.Size += sz
   115  }
   116  
   117  var (
   118  	// mappings contains replacement strings for mapped runes, each prefixed
   119  	// with a byte containing the length of the following string.
   120  	mappings = []byte{}
   121  	mapCache = map[string]int{}
   122  
   123  	// xorData is like mappings, except that it contains XOR data.
   124  	// We split these two tables so that we don't get an overflow.
   125  	xorData  = []byte{}
   126  	xorCache = map[string]int{}
   127  )
   128  
   129  // makeEntry creates a trie entry.
   130  func makeEntry(r rune, mapped string) info {
   131  	orig := string(r)
   132  
   133  	if len(orig) != len(mapped) {
   134  		// Store the mapped value as is in the mappings table.
   135  		index := len(mappings)
   136  		if x, ok := mapCache[mapped]; ok {
   137  			index = x
   138  		} else {
   139  			mapCache[mapped] = index
   140  			mappings = append(mappings, byte(len(mapped)))
   141  			mappings = append(mappings, mapped...)
   142  		}
   143  		return info(index) << indexShift
   144  	}
   145  
   146  	// Create per-byte XOR mask.
   147  	var b []byte
   148  	for i := 0; i < len(orig); i++ {
   149  		b = append(b, orig[i]^mapped[i])
   150  	}
   151  
   152  	// Remove leading 0 bytes, but keep at least one byte.
   153  	for ; len(b) > 1 && b[0] == 0; b = b[1:] {
   154  	}
   155  
   156  	if len(b) == 1 {
   157  		return xorBit | inlineXOR | info(b[0])<<indexShift
   158  	}
   159  	mapped = string(b)
   160  
   161  	// Store the mapped value as is in the mappings table.
   162  	index := len(xorData)
   163  	if x, ok := xorCache[mapped]; ok {
   164  		index = x
   165  	} else {
   166  		xorCache[mapped] = index
   167  		xorData = append(xorData, byte(len(mapped)))
   168  		xorData = append(xorData, mapped...)
   169  	}
   170  	return xorBit | info(index)<<indexShift
   171  }
   172  
   173  // The following code implements a triegen.Compacter that was originally
   174  // designed for normalization. The IDNA table has some similarities with the
   175  // norm table. Using this compacter, together with the XOR pattern approach,
   176  // reduces the table size by roughly 100K. It can probably be compressed further
   177  // by also including elements of the compacter used by cases, but for now it is
   178  // good enough.
   179  
// maxSparseEntries is the largest sparse-entry count (as computed by
// countSparseEntries) for which normCompacter.Size accepts a block.
const maxSparseEntries = 16

// normCompacter collects the blocks handed to Store and later prints them as
// the idnaSparseOffset and idnaSparseValues tables.
type normCompacter struct {
	// sparseBlocks holds every block passed to Store, in call order.
	sparseBlocks [][]uint64
	// sparseOffset holds, per stored block, the value of sparseCount at the
	// time the block was stored.
	sparseOffset []uint16
	// sparseCount is the running total of value entries: for each stored
	// block, its sparse-entry count plus one.
	sparseCount  int
}
   187  
   188  func mostFrequentStride(a []uint64) int {
   189  	counts := make(map[int]int)
   190  	var v int
   191  	for _, x := range a {
   192  		if stride := int(x) - v; v != 0 && stride >= 0 {
   193  			counts[stride]++
   194  		}
   195  		v = int(x)
   196  	}
   197  	var maxs, maxc int
   198  	for stride, cnt := range counts {
   199  		if cnt > maxc || (cnt == maxc && stride < maxs) {
   200  			maxs, maxc = stride, cnt
   201  		}
   202  	}
   203  	return maxs
   204  }
   205  
   206  func countSparseEntries(a []uint64) int {
   207  	stride := mostFrequentStride(a)
   208  	var v, count int
   209  	for _, tv := range a {
   210  		if int(tv)-v != stride {
   211  			if tv != 0 {
   212  				count++
   213  			}
   214  		}
   215  		v = int(tv)
   216  	}
   217  	return count
   218  }
   219  
   220  func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
   221  	if n := countSparseEntries(v); n <= maxSparseEntries {
   222  		return (n+1)*4 + 2, true
   223  	}
   224  	return 0, false
   225  }
   226  
   227  func (c *normCompacter) Store(v []uint64) uint32 {
   228  	h := uint32(len(c.sparseOffset))
   229  	c.sparseBlocks = append(c.sparseBlocks, v)
   230  	c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
   231  	c.sparseCount += countSparseEntries(v) + 1
   232  	return h
   233  }
   234  
   235  func (c *normCompacter) Handler() string {
   236  	return "idnaSparse.lookup"
   237  }
   238  
   239  func (c *normCompacter) Print(w io.Writer) (retErr error) {
   240  	p := func(f string, x ...interface{}) {
   241  		if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
   242  			retErr = err
   243  		}
   244  	}
   245  
   246  	ls := len(c.sparseBlocks)
   247  	p("// idnaSparseOffset: %d entries, %d bytes\n", ls, ls*2)
   248  	p("var idnaSparseOffset = %#v\n\n", c.sparseOffset)
   249  
   250  	ns := c.sparseCount
   251  	p("// idnaSparseValues: %d entries, %d bytes\n", ns, ns*4)
   252  	p("var idnaSparseValues = [%d]valueRange {", ns)
   253  	for i, b := range c.sparseBlocks {
   254  		p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
   255  		var v int
   256  		stride := mostFrequentStride(b)
   257  		n := countSparseEntries(b)
   258  		p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
   259  		for i, nv := range b {
   260  			if int(nv)-v != stride {
   261  				if v != 0 {
   262  					p(",hi:%#02x},", 0x80+i-1)
   263  				}
   264  				if nv != 0 {
   265  					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
   266  				}
   267  			}
   268  			v = int(nv)
   269  		}
   270  		if v != 0 {
   271  			p(",hi:%#02x},", 0x80+len(b)-1)
   272  		}
   273  	}
   274  	p("\n}\n\n")
   275  	return
   276  }