golang.org/x/text@v0.14.0/collate/tools/colcmp/gen.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package main 6 7 import ( 8 "math" 9 "math/rand" 10 "strings" 11 "unicode" 12 "unicode/utf16" 13 "unicode/utf8" 14 15 "golang.org/x/text/language" 16 "golang.org/x/text/unicode/norm" 17 ) 18 19 // TODO: replace with functionality in language package. 20 // parent computes the parent language for the given language. 21 // It returns false if the parent is already root. 22 func parent(locale string) (parent string, ok bool) { 23 if locale == "und" { 24 return "", false 25 } 26 if i := strings.LastIndex(locale, "-"); i != -1 { 27 return locale[:i], true 28 } 29 return "und", true 30 } 31 32 // rewriter is used to both unique strings and create variants of strings 33 // to add to the test set. 34 type rewriter struct { 35 seen map[string]bool 36 addCases bool 37 } 38 39 func newRewriter() *rewriter { 40 return &rewriter{ 41 seen: make(map[string]bool), 42 } 43 } 44 45 func (r *rewriter) insert(a []string, s string) []string { 46 if !r.seen[s] { 47 r.seen[s] = true 48 a = append(a, s) 49 } 50 return a 51 } 52 53 // rewrite takes a sequence of strings in, adds variants of the these strings 54 // based on options and removes duplicates. 55 func (r *rewriter) rewrite(ss []string) []string { 56 ns := []string{} 57 for _, s := range ss { 58 ns = r.insert(ns, s) 59 if r.addCases { 60 rs := []rune(s) 61 rn := rs[0] 62 for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) { 63 rs[0] = c 64 ns = r.insert(ns, string(rs)) 65 } 66 } 67 } 68 return ns 69 } 70 71 // exemplarySet holds a parsed set of characters from the exemplarCharacters table. 72 type exemplarySet struct { 73 typ exemplarType 74 set []string 75 charIndex int // cumulative total of phrases, including this set 76 } 77 78 type phraseGenerator struct { 79 sets [exN]exemplarySet 80 n int 81 } 82 83 func (g *phraseGenerator) init(id string) { 84 ec := exemplarCharacters 85 loc := language.Make(id).String() 86 // get sets for locale or parent locale if the set is not defined. 87 for i := range g.sets { 88 for p, ok := loc, true; ok; p, ok = parent(p) { 89 if set, ok := ec[p]; ok && set[i] != "" { 90 g.sets[i].set = strings.Split(set[i], " ") 91 break 92 } 93 } 94 } 95 r := newRewriter() 96 r.addCases = *cases 97 for i := range g.sets { 98 g.sets[i].set = r.rewrite(g.sets[i].set) 99 } 100 // compute indexes 101 for i, set := range g.sets { 102 g.n += len(set.set) 103 g.sets[i].charIndex = g.n 104 } 105 } 106 107 // phrase returns the ith phrase, where i < g.n. 108 func (g *phraseGenerator) phrase(i int) string { 109 for _, set := range g.sets { 110 if i < set.charIndex { 111 return set.set[i-(set.charIndex-len(set.set))] 112 } 113 } 114 panic("index out of range") 115 } 116 117 // generate generates inputs by combining all pairs of examplar strings. 118 // If doNorm is true, all input strings are normalized to NFC. 119 // TODO: allow other variations, statistical models, and random 120 // trailing sequences. 121 func (g *phraseGenerator) generate(doNorm bool) []Input { 122 const ( 123 M = 1024 * 1024 124 buf8Size = 30 * M 125 buf16Size = 10 * M 126 ) 127 // TODO: use a better way to limit the input size. 128 if sq := int(math.Sqrt(float64(*limit))); g.n > sq { 129 g.n = sq 130 } 131 size := g.n * g.n 132 a := make([]Input, 0, size) 133 buf8 := make([]byte, 0, buf8Size) 134 buf16 := make([]uint16, 0, buf16Size) 135 136 addInput := func(str string) { 137 buf8 = buf8[len(buf8):] 138 buf16 = buf16[len(buf16):] 139 if len(str) > cap(buf8) { 140 buf8 = make([]byte, 0, buf8Size) 141 } 142 if len(str) > cap(buf16) { 143 buf16 = make([]uint16, 0, buf16Size) 144 } 145 if doNorm { 146 buf8 = norm.NFD.AppendString(buf8, str) 147 } else { 148 buf8 = append(buf8, str...) 149 } 150 buf16 = appendUTF16(buf16, buf8) 151 a = append(a, makeInput(buf8, buf16)) 152 } 153 for i := 0; i < g.n; i++ { 154 p1 := g.phrase(i) 155 addInput(p1) 156 for j := 0; j < g.n; j++ { 157 p2 := g.phrase(j) 158 addInput(p1 + p2) 159 } 160 } 161 // permutate 162 rnd := rand.New(rand.NewSource(int64(rand.Int()))) 163 for i := range a { 164 j := i + rnd.Intn(len(a)-i) 165 a[i], a[j] = a[j], a[i] 166 a[i].index = i // allow restoring this order if input is used multiple times. 167 } 168 return a 169 } 170 171 func appendUTF16(buf []uint16, s []byte) []uint16 { 172 for len(s) > 0 { 173 r, sz := utf8.DecodeRune(s) 174 s = s[sz:] 175 r1, r2 := utf16.EncodeRune(r) 176 if r1 != 0xFFFD { 177 buf = append(buf, uint16(r1), uint16(r2)) 178 } else { 179 buf = append(buf, uint16(r)) 180 } 181 } 182 return buf 183 }