github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/triegen/print.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package triegen 6 7 import ( 8 "bytes" 9 "fmt" 10 "io" 11 "strings" 12 "text/template" 13 ) 14 15 // print writes all the data structures as well as the code necessary to use the 16 // trie to w. 17 func (b *builder) print(w io.Writer) error { 18 b.Stats.NValueEntries = len(b.ValueBlocks) * blockSize 19 b.Stats.NValueBytes = len(b.ValueBlocks) * blockSize * b.ValueSize 20 b.Stats.NIndexEntries = len(b.IndexBlocks) * blockSize 21 b.Stats.NIndexBytes = len(b.IndexBlocks) * blockSize * b.IndexSize 22 b.Stats.NHandleBytes = len(b.Trie) * 2 * b.IndexSize 23 24 // If we only have one root trie, all starter blocks are at position 0 and 25 // we can access the arrays directly. 26 if len(b.Trie) == 1 { 27 // At this point we cannot refer to the generated tables directly. 28 b.ASCIIBlock = b.Name + "Values" 29 b.StarterBlock = b.Name + "Index" 30 } else { 31 // Otherwise we need to have explicit starter indexes in the trie 32 // structure. 33 b.ASCIIBlock = "t.ascii" 34 b.StarterBlock = "t.utf8Start" 35 } 36 37 b.SourceType = "[]byte" 38 if err := lookupGen.Execute(w, b); err != nil { 39 return err 40 } 41 42 b.SourceType = "string" 43 if err := lookupGen.Execute(w, b); err != nil { 44 return err 45 } 46 47 if err := trieGen.Execute(w, b); err != nil { 48 return err 49 } 50 51 for _, c := range b.Compactions { 52 if err := c.c.Print(w); err != nil { 53 return err 54 } 55 } 56 57 return nil 58 } 59 60 func printValues(n int, values []uint64) string { 61 w := &bytes.Buffer{} 62 boff := n * blockSize 63 fmt.Fprintf(w, "\t// Block %#x, offset %#x", n, boff) 64 var newline bool 65 for i, v := range values { 66 if i%6 == 0 { 67 newline = true 68 } 69 if v != 0 { 70 if newline { 71 fmt.Fprintf(w, "\n") 72 newline = false 73 } 74 fmt.Fprintf(w, "\t%#02x:%#04x, ", boff+i, v) 75 } 76 } 77 return w.String() 78 } 79 80 func printIndex(b *builder, nr int, n *node) string { 81 w := &bytes.Buffer{} 82 boff := nr * blockSize 83 fmt.Fprintf(w, "\t// Block %#x, offset %#x", nr, boff) 84 var newline bool 85 for i, c := range n.children { 86 if i%8 == 0 { 87 newline = true 88 } 89 if c != nil { 90 v := b.Compactions[c.index.compaction].Offset + uint32(c.index.index) 91 if v != 0 { 92 if newline { 93 fmt.Fprintf(w, "\n") 94 newline = false 95 } 96 fmt.Fprintf(w, "\t%#02x:%#02x, ", boff+i, v) 97 } 98 } 99 } 100 return w.String() 101 } 102 103 var ( 104 trieGen = template.Must(template.New("trie").Funcs(template.FuncMap{ 105 "printValues": printValues, 106 "printIndex": printIndex, 107 "title": strings.Title, 108 "dec": func(x int) int { return x - 1 }, 109 "psize": func(n int) string { 110 return fmt.Sprintf("%d bytes (%.2f KiB)", n, float64(n)/1024) 111 }, 112 }).Parse(trieTemplate)) 113 lookupGen = template.Must(template.New("lookup").Parse(lookupTemplate)) 114 ) 115 116 // TODO: consider the return type of lookup. It could be uint64, even if the 117 // internal value type is smaller. We will have to verify this with the 118 // performance of unicode/norm, which is very sensitive to such changes. 119 const trieTemplate = `{{$b := .}}{{$multi := gt (len .Trie) 1}} 120 // {{.Name}}Trie. Total size: {{psize .Size}}. Checksum: {{printf "%08x" .Checksum}}. 121 type {{.Name}}Trie struct { {{if $multi}} 122 ascii []{{.ValueType}} // index for ASCII bytes 123 utf8Start []{{.IndexType}} // index for UTF-8 bytes >= 0xC0 124 {{end}}} 125 126 func new{{title .Name}}Trie(i int) *{{.Name}}Trie { {{if $multi}} 127 h := {{.Name}}TrieHandles[i] 128 return &{{.Name}}Trie{ {{.Name}}Values[uint32(h.ascii)<<6:], {{.Name}}Index[uint32(h.multi)<<6:] } 129 } 130 131 type {{.Name}}TrieHandle struct { 132 ascii, multi {{.IndexType}} 133 } 134 135 // {{.Name}}TrieHandles: {{len .Trie}} handles, {{.Stats.NHandleBytes}} bytes 136 var {{.Name}}TrieHandles = [{{len .Trie}}]{{.Name}}TrieHandle{ 137 {{range .Trie}} { {{.ASCIIIndex}}, {{.StarterIndex}} }, // {{printf "%08x" .Checksum}}: {{.Name}} 138 {{end}}}{{else}} 139 return &{{.Name}}Trie{} 140 } 141 {{end}} 142 // lookupValue determines the type of block n and looks up the value for b. 143 func (t *{{.Name}}Trie) lookupValue(n uint32, b byte) {{.ValueType}}{{$last := dec (len .Compactions)}} { 144 switch { {{range $i, $c := .Compactions}} 145 {{if eq $i $last}}default{{else}}case n < {{$c.Cutoff}}{{end}}:{{if ne $i 0}} 146 n -= {{$c.Offset}}{{end}} 147 return {{print $b.ValueType}}({{$c.Handler}}){{end}} 148 } 149 } 150 151 // {{.Name}}Values: {{len .ValueBlocks}} blocks, {{.Stats.NValueEntries}} entries, {{.Stats.NValueBytes}} bytes 152 // The third block is the zero block. 153 var {{.Name}}Values = [{{.Stats.NValueEntries}}]{{.ValueType}} { 154 {{range $i, $v := .ValueBlocks}}{{printValues $i $v}} 155 {{end}}} 156 157 // {{.Name}}Index: {{len .IndexBlocks}} blocks, {{.Stats.NIndexEntries}} entries, {{.Stats.NIndexBytes}} bytes 158 // Block 0 is the zero block. 159 var {{.Name}}Index = [{{.Stats.NIndexEntries}}]{{.IndexType}} { 160 {{range $i, $v := .IndexBlocks}}{{printIndex $b $i $v}} 161 {{end}}} 162 ` 163 164 // TODO: consider allowing zero-length strings after evaluating performance with 165 // unicode/norm. 166 const lookupTemplate = ` 167 // lookup{{if eq .SourceType "string"}}String{{end}} returns the trie value for the first UTF-8 encoding in s and 168 // the width in bytes of this encoding. The size will be 0 if s does not 169 // hold enough bytes to complete the encoding. len(s) must be greater than 0. 170 func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}(s {{.SourceType}}) (v {{.ValueType}}, sz int) { 171 c0 := s[0] 172 switch { 173 case c0 < 0x80: // is ASCII 174 return {{.ASCIIBlock}}[c0], 1 175 case c0 < 0xC2: 176 return 0, 1 // Illegal UTF-8: not a starter, not ASCII. 177 case c0 < 0xE0: // 2-byte UTF-8 178 if len(s) < 2 { 179 return 0, 0 180 } 181 i := {{.StarterBlock}}[c0] 182 c1 := s[1] 183 if c1 < 0x80 || 0xC0 <= c1 { 184 return 0, 1 // Illegal UTF-8: not a continuation byte. 185 } 186 return t.lookupValue(uint32(i), c1), 2 187 case c0 < 0xF0: // 3-byte UTF-8 188 if len(s) < 3 { 189 return 0, 0 190 } 191 i := {{.StarterBlock}}[c0] 192 c1 := s[1] 193 if c1 < 0x80 || 0xC0 <= c1 { 194 return 0, 1 // Illegal UTF-8: not a continuation byte. 195 } 196 o := uint32(i)<<6 + uint32(c1) 197 i = {{.Name}}Index[o] 198 c2 := s[2] 199 if c2 < 0x80 || 0xC0 <= c2 { 200 return 0, 2 // Illegal UTF-8: not a continuation byte. 201 } 202 return t.lookupValue(uint32(i), c2), 3 203 case c0 < 0xF8: // 4-byte UTF-8 204 if len(s) < 4 { 205 return 0, 0 206 } 207 i := {{.StarterBlock}}[c0] 208 c1 := s[1] 209 if c1 < 0x80 || 0xC0 <= c1 { 210 return 0, 1 // Illegal UTF-8: not a continuation byte. 211 } 212 o := uint32(i)<<6 + uint32(c1) 213 i = {{.Name}}Index[o] 214 c2 := s[2] 215 if c2 < 0x80 || 0xC0 <= c2 { 216 return 0, 2 // Illegal UTF-8: not a continuation byte. 217 } 218 o = uint32(i)<<6 + uint32(c2) 219 i = {{.Name}}Index[o] 220 c3 := s[3] 221 if c3 < 0x80 || 0xC0 <= c3 { 222 return 0, 3 // Illegal UTF-8: not a continuation byte. 223 } 224 return t.lookupValue(uint32(i), c3), 4 225 } 226 // Illegal rune 227 return 0, 1 228 } 229 230 // lookup{{if eq .SourceType "string"}}String{{end}}Unsafe returns the trie value for the first UTF-8 encoding in s. 231 // s must start with a full and valid UTF-8 encoded rune. 232 func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}Unsafe(s {{.SourceType}}) {{.ValueType}} { 233 c0 := s[0] 234 if c0 < 0x80 { // is ASCII 235 return {{.ASCIIBlock}}[c0] 236 } 237 i := {{.StarterBlock}}[c0] 238 if c0 < 0xE0 { // 2-byte UTF-8 239 return t.lookupValue(uint32(i), s[1]) 240 } 241 i = {{.Name}}Index[uint32(i)<<6+uint32(s[1])] 242 if c0 < 0xF0 { // 3-byte UTF-8 243 return t.lookupValue(uint32(i), s[2]) 244 } 245 i = {{.Name}}Index[uint32(i)<<6+uint32(s[2])] 246 if c0 < 0xF8 { // 4-byte UTF-8 247 return t.lookupValue(uint32(i), s[3]) 248 } 249 return 0 250 } 251 `