github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/encoding/charmap/maketables.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build ignore 6 // +build ignore 7 8 package main 9 10 import ( 11 "bufio" 12 "fmt" 13 "log" 14 "net/http" 15 "sort" 16 "strings" 17 "unicode/utf8" 18 19 "github.com/go-enjin/golang-org-x-text/encoding" 20 "github.com/go-enjin/golang-org-x-text/internal/gen" 21 ) 22 23 const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + 24 "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + 25 ` !"#$%&'()*+,-./0123456789:;<=>?` + 26 `@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` + 27 "`abcdefghijklmnopqrstuvwxyz{|}~\u007f" 28 29 var encodings = []struct { 30 name string 31 mib string 32 comment string 33 varName string 34 replacement byte 35 mapping string 36 }{ 37 { 38 "IBM Code Page 037", 39 "IBM037", 40 "", 41 "CodePage037", 42 0x3f, 43 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm", 44 }, 45 { 46 "IBM Code Page 437", 47 "PC8CodePage437", 48 "", 49 "CodePage437", 50 encoding.ASCIISub, 51 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm", 52 }, 53 { 54 "IBM Code Page 850", 55 "PC850Multilingual", 56 "", 57 "CodePage850", 58 encoding.ASCIISub, 59 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm", 60 }, 61 { 62 "IBM Code Page 852", 63 "PCp852", 64 "", 65 "CodePage852", 66 encoding.ASCIISub, 67 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm", 68 }, 69 { 70 "IBM Code Page 855", 71 "IBM855", 72 "", 73 "CodePage855", 74 encoding.ASCIISub, 75 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm", 76 }, 77 { 78 "Windows Code Page 858", // PC latin1 with Euro 79 "IBM00858", 80 "", 81 "CodePage858", 82 encoding.ASCIISub, 83 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm", 84 }, 85 { 86 "IBM Code Page 860", 87 "IBM860", 88 "", 89 "CodePage860", 90 encoding.ASCIISub, 91 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm", 92 }, 93 { 94 "IBM Code Page 862", 95 "PC862LatinHebrew", 96 "", 97 "CodePage862", 98 encoding.ASCIISub, 99 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm", 100 }, 101 { 102 "IBM Code Page 863", 103 "IBM863", 104 "", 105 "CodePage863", 106 encoding.ASCIISub, 107 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm", 108 }, 109 { 110 "IBM Code Page 865", 111 "IBM865", 112 "", 113 "CodePage865", 114 encoding.ASCIISub, 115 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm", 116 }, 117 { 118 "IBM Code Page 866", 119 "IBM866", 120 "", 121 "CodePage866", 122 encoding.ASCIISub, 123 "http://encoding.spec.whatwg.org/index-ibm866.txt", 124 }, 125 { 126 "IBM Code Page 1047", 127 "IBM1047", 128 "", 129 "CodePage1047", 130 0x3f, 131 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm", 132 }, 133 { 134 "IBM Code Page 1140", 135 "IBM01140", 136 "", 137 "CodePage1140", 138 0x3f, 139 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm", 140 }, 141 { 142 "ISO 8859-1", 143 "ISOLatin1", 144 "", 145 "ISO8859_1", 146 encoding.ASCIISub, 147 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm", 148 }, 149 { 150 "ISO 8859-2", 151 "ISOLatin2", 152 "", 153 "ISO8859_2", 154 encoding.ASCIISub, 155 "http://encoding.spec.whatwg.org/index-iso-8859-2.txt", 156 }, 157 { 158 "ISO 8859-3", 159 "ISOLatin3", 160 "", 161 "ISO8859_3", 162 encoding.ASCIISub, 163 "http://encoding.spec.whatwg.org/index-iso-8859-3.txt", 164 }, 165 { 166 "ISO 8859-4", 167 "ISOLatin4", 168 "", 169 "ISO8859_4", 170 encoding.ASCIISub, 171 "http://encoding.spec.whatwg.org/index-iso-8859-4.txt", 172 }, 173 { 174 "ISO 8859-5", 175 "ISOLatinCyrillic", 176 "", 177 "ISO8859_5", 178 encoding.ASCIISub, 179 "http://encoding.spec.whatwg.org/index-iso-8859-5.txt", 180 }, 181 { 182 "ISO 8859-6", 183 "ISOLatinArabic", 184 "", 185 "ISO8859_6,ISO8859_6E,ISO8859_6I", 186 encoding.ASCIISub, 187 "http://encoding.spec.whatwg.org/index-iso-8859-6.txt", 188 }, 189 { 190 "ISO 8859-7", 191 "ISOLatinGreek", 192 "", 193 "ISO8859_7", 194 encoding.ASCIISub, 195 "http://encoding.spec.whatwg.org/index-iso-8859-7.txt", 196 }, 197 { 198 "ISO 8859-8", 199 "ISOLatinHebrew", 200 "", 201 "ISO8859_8,ISO8859_8E,ISO8859_8I", 202 encoding.ASCIISub, 203 "http://encoding.spec.whatwg.org/index-iso-8859-8.txt", 204 }, 205 { 206 "ISO 8859-9", 207 "ISOLatin5", 208 "", 209 "ISO8859_9", 210 encoding.ASCIISub, 211 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm", 212 }, 213 { 214 "ISO 8859-10", 215 "ISOLatin6", 216 "", 217 "ISO8859_10", 218 encoding.ASCIISub, 219 "http://encoding.spec.whatwg.org/index-iso-8859-10.txt", 220 }, 221 { 222 "ISO 8859-13", 223 "ISO885913", 224 "", 225 "ISO8859_13", 226 encoding.ASCIISub, 227 "http://encoding.spec.whatwg.org/index-iso-8859-13.txt", 228 }, 229 { 230 "ISO 8859-14", 231 "ISO885914", 232 "", 233 "ISO8859_14", 234 encoding.ASCIISub, 235 "http://encoding.spec.whatwg.org/index-iso-8859-14.txt", 236 }, 237 { 238 "ISO 8859-15", 239 "ISO885915", 240 "", 241 "ISO8859_15", 242 encoding.ASCIISub, 243 "http://encoding.spec.whatwg.org/index-iso-8859-15.txt", 244 }, 245 { 246 "ISO 8859-16", 247 "ISO885916", 248 "", 249 "ISO8859_16", 250 encoding.ASCIISub, 251 "http://encoding.spec.whatwg.org/index-iso-8859-16.txt", 252 }, 253 { 254 "KOI8-R", 255 "KOI8R", 256 "", 257 "KOI8R", 258 encoding.ASCIISub, 259 "http://encoding.spec.whatwg.org/index-koi8-r.txt", 260 }, 261 { 262 "KOI8-U", 263 "KOI8U", 264 "", 265 "KOI8U", 266 encoding.ASCIISub, 267 "http://encoding.spec.whatwg.org/index-koi8-u.txt", 268 }, 269 { 270 "Macintosh", 271 "Macintosh", 272 "", 273 "Macintosh", 274 encoding.ASCIISub, 275 "http://encoding.spec.whatwg.org/index-macintosh.txt", 276 }, 277 { 278 "Macintosh Cyrillic", 279 "MacintoshCyrillic", 280 "", 281 "MacintoshCyrillic", 282 encoding.ASCIISub, 283 "http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt", 284 }, 285 { 286 "Windows 874", 287 "Windows874", 288 "", 289 "Windows874", 290 encoding.ASCIISub, 291 "http://encoding.spec.whatwg.org/index-windows-874.txt", 292 }, 293 { 294 "Windows 1250", 295 "Windows1250", 296 "", 297 "Windows1250", 298 encoding.ASCIISub, 299 "http://encoding.spec.whatwg.org/index-windows-1250.txt", 300 }, 301 { 302 "Windows 1251", 303 "Windows1251", 304 "", 305 "Windows1251", 306 encoding.ASCIISub, 307 "http://encoding.spec.whatwg.org/index-windows-1251.txt", 308 }, 309 { 310 "Windows 1252", 311 "Windows1252", 312 "", 313 "Windows1252", 314 encoding.ASCIISub, 315 "http://encoding.spec.whatwg.org/index-windows-1252.txt", 316 }, 317 { 318 "Windows 1253", 319 "Windows1253", 320 "", 321 "Windows1253", 322 encoding.ASCIISub, 323 "http://encoding.spec.whatwg.org/index-windows-1253.txt", 324 }, 325 { 326 "Windows 1254", 327 "Windows1254", 328 "", 329 "Windows1254", 330 encoding.ASCIISub, 331 "http://encoding.spec.whatwg.org/index-windows-1254.txt", 332 }, 333 { 334 "Windows 1255", 335 "Windows1255", 336 "", 337 "Windows1255", 338 encoding.ASCIISub, 339 "http://encoding.spec.whatwg.org/index-windows-1255.txt", 340 }, 341 { 342 "Windows 1256", 343 "Windows1256", 344 "", 345 "Windows1256", 346 encoding.ASCIISub, 347 "http://encoding.spec.whatwg.org/index-windows-1256.txt", 348 }, 349 { 350 "Windows 1257", 351 "Windows1257", 352 "", 353 "Windows1257", 354 encoding.ASCIISub, 355 "http://encoding.spec.whatwg.org/index-windows-1257.txt", 356 }, 357 { 358 "Windows 1258", 359 "Windows1258", 360 "", 361 "Windows1258", 362 encoding.ASCIISub, 363 "http://encoding.spec.whatwg.org/index-windows-1258.txt", 364 }, 365 { 366 "X-User-Defined", 367 "XUserDefined", 368 "It is defined at http://encoding.spec.whatwg.org/#x-user-defined", 369 "XUserDefined", 370 encoding.ASCIISub, 371 ascii + 372 "\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" + 373 "\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" + 374 "\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" + 375 "\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" + 376 "\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" + 377 "\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" + 378 "\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" + 379 "\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" + 380 "\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" + 381 "\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" + 382 "\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" + 383 "\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" + 384 "\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" + 385 "\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" + 386 "\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" + 387 "\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff", 388 }, 389 } 390 391 func getWHATWG(url string) string { 392 res, err := http.Get(url) 393 if err != nil { 394 log.Fatalf("%q: Get: %v", url, err) 395 } 396 defer res.Body.Close() 397 398 mapping := make([]rune, 128) 399 for i := range mapping { 400 mapping[i] = '\ufffd' 401 } 402 403 scanner := bufio.NewScanner(res.Body) 404 for scanner.Scan() { 405 s := strings.TrimSpace(scanner.Text()) 406 if s == "" || s[0] == '#' { 407 continue 408 } 409 x, y := 0, 0 410 if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil { 411 log.Fatalf("could not parse %q", s) 412 } 413 if x < 0 || 128 <= x { 414 log.Fatalf("code %d is out of range", x) 415 } 416 if 0x80 <= y && y < 0xa0 { 417 // We diverge from the WHATWG spec by mapping control characters 418 // in the range [0x80, 0xa0) to U+FFFD. 419 continue 420 } 421 mapping[x] = rune(y) 422 } 423 return ascii + string(mapping) 424 } 425 426 func getUCM(url string) string { 427 res, err := http.Get(url) 428 if err != nil { 429 log.Fatalf("%q: Get: %v", url, err) 430 } 431 defer res.Body.Close() 432 433 mapping := make([]rune, 256) 434 for i := range mapping { 435 mapping[i] = '\ufffd' 436 } 437 438 charsFound := 0 439 scanner := bufio.NewScanner(res.Body) 440 for scanner.Scan() { 441 s := strings.TrimSpace(scanner.Text()) 442 if s == "" || s[0] == '#' { 443 continue 444 } 445 var c byte 446 var r rune 447 if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil { 448 continue 449 } 450 mapping[c] = r 451 charsFound++ 452 } 453 454 if charsFound < 200 { 455 log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound) 456 } 457 458 return string(mapping) 459 } 460 461 func main() { 462 mibs := map[string]bool{} 463 all := []string{} 464 465 w := gen.NewCodeWriter() 466 defer w.WriteGoFile("tables.go", "charmap") 467 468 printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) } 469 470 printf("import (\n") 471 printf("\t\"github.com/go-enjin/golang-org-x-text/encoding\"\n") 472 printf("\t\"github.com/go-enjin/golang-org-x-text/encoding/internal/identifier\"\n") 473 printf(")\n\n") 474 for _, e := range encodings { 475 varNames := strings.Split(e.varName, ",") 476 all = append(all, varNames...) 477 varName := varNames[0] 478 switch { 479 case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"): 480 e.mapping = getWHATWG(e.mapping) 481 case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"): 482 e.mapping = getUCM(e.mapping) 483 } 484 485 asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00 486 if asciiSuperset { 487 low = 0x80 488 } 489 lvn := 1 490 if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") { 491 lvn = 3 492 } 493 lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:] 494 printf("// %s is the %s encoding.\n", varName, e.name) 495 if e.comment != "" { 496 printf("//\n// %s\n", e.comment) 497 } 498 printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n", 499 varName, lowerVarName, lowerVarName, e.name) 500 if mibs[e.mib] { 501 log.Fatalf("MIB type %q declared multiple times.", e.mib) 502 } 503 printf("mib: identifier.%s,\n", e.mib) 504 printf("asciiSuperset: %t,\n", asciiSuperset) 505 printf("low: 0x%02x,\n", low) 506 printf("replacement: 0x%02x,\n", e.replacement) 507 508 printf("decode: [256]utf8Enc{\n") 509 i, backMapping := 0, map[rune]byte{} 510 for _, c := range e.mapping { 511 if _, ok := backMapping[c]; !ok && c != utf8.RuneError { 512 backMapping[c] = byte(i) 513 } 514 var buf [8]byte 515 n := utf8.EncodeRune(buf[:], c) 516 if n > 3 { 517 panic(fmt.Sprintf("rune %q (%U) is too long", c, c)) 518 } 519 printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2]) 520 if i%2 == 1 { 521 printf("\n") 522 } 523 i++ 524 } 525 printf("},\n") 526 527 printf("encode: [256]uint32{\n") 528 encode := make([]uint32, 0, 256) 529 for c, i := range backMapping { 530 encode = append(encode, uint32(i)<<24|uint32(c)) 531 } 532 sort.Sort(byRune(encode)) 533 for len(encode) < cap(encode) { 534 encode = append(encode, encode[len(encode)-1]) 535 } 536 for i, enc := range encode { 537 printf("0x%08x,", enc) 538 if i%8 == 7 { 539 printf("\n") 540 } 541 } 542 printf("},\n}\n") 543 544 // Add an estimate of the size of a single Charmap{} struct value, which 545 // includes two 256 elem arrays of 4 bytes and some extra fields, which 546 // align to 3 uint64s on 64-bit architectures. 547 w.Size += 2*4*256 + 3*8 548 } 549 // TODO: add proper line breaking. 550 printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n")) 551 } 552 553 type byRune []uint32 554 555 func (b byRune) Len() int { return len(b) } 556 func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff } 557 func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] }