github.com/go-xe2/third@v1.0.3/golang.org/x/text/encoding/charmap/maketables.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 package main 8 9 import ( 10 "bufio" 11 "fmt" 12 "log" 13 "net/http" 14 "sort" 15 "strings" 16 "unicode/utf8" 17 18 "github.com/go-xe2/third/golang.org/x/text/encoding" 19 "github.com/go-xe2/third/golang.org/x/text/internal/gen" 20 ) 21 22 const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + 23 "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + 24 ` !"#$%&'()*+,-./0123456789:;<=>?` + 25 `@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` + 26 "`abcdefghijklmnopqrstuvwxyz{|}~\u007f" 27 28 var encodings = []struct { 29 name string 30 mib string 31 comment string 32 varName string 33 replacement byte 34 mapping string 35 }{ 36 { 37 "IBM Code Page 037", 38 "IBM037", 39 "", 40 "CodePage037", 41 0x3f, 42 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm", 43 }, 44 { 45 "IBM Code Page 437", 46 "PC8CodePage437", 47 "", 48 "CodePage437", 49 encoding.ASCIISub, 50 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm", 51 }, 52 { 53 "IBM Code Page 850", 54 "PC850Multilingual", 55 "", 56 "CodePage850", 57 encoding.ASCIISub, 58 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm", 59 }, 60 { 61 "IBM Code Page 852", 62 "PCp852", 63 "", 64 "CodePage852", 65 encoding.ASCIISub, 66 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm", 67 }, 68 { 69 "IBM Code Page 855", 70 "IBM855", 71 "", 72 "CodePage855", 73 encoding.ASCIISub, 74 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm", 75 }, 76 { 77 "Windows Code Page 858", // PC latin1 with Euro 78 "IBM00858", 79 "", 80 "CodePage858", 81 encoding.ASCIISub, 82 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm", 83 }, 84 { 85 "IBM Code Page 860", 86 "IBM860", 87 "", 88 "CodePage860", 89 encoding.ASCIISub, 90 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm", 91 }, 92 { 93 "IBM Code Page 862", 94 "PC862LatinHebrew", 95 "", 96 "CodePage862", 97 encoding.ASCIISub, 98 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm", 99 }, 100 { 101 "IBM Code Page 863", 102 "IBM863", 103 "", 104 "CodePage863", 105 encoding.ASCIISub, 106 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm", 107 }, 108 { 109 "IBM Code Page 865", 110 "IBM865", 111 "", 112 "CodePage865", 113 encoding.ASCIISub, 114 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm", 115 }, 116 { 117 "IBM Code Page 866", 118 "IBM866", 119 "", 120 "CodePage866", 121 encoding.ASCIISub, 122 "http://encoding.spec.whatwg.org/index-ibm866.txt", 123 }, 124 { 125 "IBM Code Page 1047", 126 "IBM1047", 127 "", 128 "CodePage1047", 129 0x3f, 130 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm", 131 }, 132 { 133 "IBM Code Page 1140", 134 "IBM01140", 135 "", 136 "CodePage1140", 137 0x3f, 138 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm", 139 }, 140 { 141 "ISO 8859-1", 142 "ISOLatin1", 143 "", 144 "ISO8859_1", 145 encoding.ASCIISub, 146 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm", 147 }, 148 { 149 "ISO 8859-2", 150 "ISOLatin2", 151 "", 152 "ISO8859_2", 153 encoding.ASCIISub, 154 "http://encoding.spec.whatwg.org/index-iso-8859-2.txt", 155 }, 156 { 157 "ISO 8859-3", 158 "ISOLatin3", 159 "", 160 "ISO8859_3", 161 encoding.ASCIISub, 162 "http://encoding.spec.whatwg.org/index-iso-8859-3.txt", 163 }, 164 { 165 "ISO 8859-4", 166 "ISOLatin4", 167 "", 168 "ISO8859_4", 169 encoding.ASCIISub, 170 "http://encoding.spec.whatwg.org/index-iso-8859-4.txt", 171 }, 172 { 173 "ISO 8859-5", 174 "ISOLatinCyrillic", 175 "", 176 "ISO8859_5", 177 encoding.ASCIISub, 178 "http://encoding.spec.whatwg.org/index-iso-8859-5.txt", 179 }, 180 { 181 "ISO 8859-6", 182 "ISOLatinArabic", 183 "", 184 "ISO8859_6,ISO8859_6E,ISO8859_6I", 185 encoding.ASCIISub, 186 "http://encoding.spec.whatwg.org/index-iso-8859-6.txt", 187 }, 188 { 189 "ISO 8859-7", 190 "ISOLatinGreek", 191 "", 192 "ISO8859_7", 193 encoding.ASCIISub, 194 "http://encoding.spec.whatwg.org/index-iso-8859-7.txt", 195 }, 196 { 197 "ISO 8859-8", 198 "ISOLatinHebrew", 199 "", 200 "ISO8859_8,ISO8859_8E,ISO8859_8I", 201 encoding.ASCIISub, 202 "http://encoding.spec.whatwg.org/index-iso-8859-8.txt", 203 }, 204 { 205 "ISO 8859-9", 206 "ISOLatin5", 207 "", 208 "ISO8859_9", 209 encoding.ASCIISub, 210 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm", 211 }, 212 { 213 "ISO 8859-10", 214 "ISOLatin6", 215 "", 216 "ISO8859_10", 217 encoding.ASCIISub, 218 "http://encoding.spec.whatwg.org/index-iso-8859-10.txt", 219 }, 220 { 221 "ISO 8859-13", 222 "ISO885913", 223 "", 224 "ISO8859_13", 225 encoding.ASCIISub, 226 "http://encoding.spec.whatwg.org/index-iso-8859-13.txt", 227 }, 228 { 229 "ISO 8859-14", 230 "ISO885914", 231 "", 232 "ISO8859_14", 233 encoding.ASCIISub, 234 "http://encoding.spec.whatwg.org/index-iso-8859-14.txt", 235 }, 236 { 237 "ISO 8859-15", 238 "ISO885915", 239 "", 240 "ISO8859_15", 241 encoding.ASCIISub, 242 "http://encoding.spec.whatwg.org/index-iso-8859-15.txt", 243 }, 244 { 245 "ISO 8859-16", 246 "ISO885916", 247 "", 248 "ISO8859_16", 249 encoding.ASCIISub, 250 "http://encoding.spec.whatwg.org/index-iso-8859-16.txt", 251 }, 252 { 253 "KOI8-R", 254 "KOI8R", 255 "", 256 "KOI8R", 257 encoding.ASCIISub, 258 "http://encoding.spec.whatwg.org/index-koi8-r.txt", 259 }, 260 { 261 "KOI8-U", 262 "KOI8U", 263 "", 264 "KOI8U", 265 encoding.ASCIISub, 266 "http://encoding.spec.whatwg.org/index-koi8-u.txt", 267 }, 268 { 269 "Macintosh", 270 "Macintosh", 271 "", 272 "Macintosh", 273 encoding.ASCIISub, 274 "http://encoding.spec.whatwg.org/index-macintosh.txt", 275 }, 276 { 277 "Macintosh Cyrillic", 278 "MacintoshCyrillic", 279 "", 280 "MacintoshCyrillic", 281 encoding.ASCIISub, 282 "http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt", 283 }, 284 { 285 "Windows 874", 286 "Windows874", 287 "", 288 "Windows874", 289 encoding.ASCIISub, 290 "http://encoding.spec.whatwg.org/index-windows-874.txt", 291 }, 292 { 293 "Windows 1250", 294 "Windows1250", 295 "", 296 "Windows1250", 297 encoding.ASCIISub, 298 "http://encoding.spec.whatwg.org/index-windows-1250.txt", 299 }, 300 { 301 "Windows 1251", 302 "Windows1251", 303 "", 304 "Windows1251", 305 encoding.ASCIISub, 306 "http://encoding.spec.whatwg.org/index-windows-1251.txt", 307 }, 308 { 309 "Windows 1252", 310 "Windows1252", 311 "", 312 "Windows1252", 313 encoding.ASCIISub, 314 "http://encoding.spec.whatwg.org/index-windows-1252.txt", 315 }, 316 { 317 "Windows 1253", 318 "Windows1253", 319 "", 320 "Windows1253", 321 encoding.ASCIISub, 322 "http://encoding.spec.whatwg.org/index-windows-1253.txt", 323 }, 324 { 325 "Windows 1254", 326 "Windows1254", 327 "", 328 "Windows1254", 329 encoding.ASCIISub, 330 "http://encoding.spec.whatwg.org/index-windows-1254.txt", 331 }, 332 { 333 "Windows 1255", 334 "Windows1255", 335 "", 336 "Windows1255", 337 encoding.ASCIISub, 338 "http://encoding.spec.whatwg.org/index-windows-1255.txt", 339 }, 340 { 341 "Windows 1256", 342 "Windows1256", 343 "", 344 "Windows1256", 345 encoding.ASCIISub, 346 "http://encoding.spec.whatwg.org/index-windows-1256.txt", 347 }, 348 { 349 "Windows 1257", 350 "Windows1257", 351 "", 352 "Windows1257", 353 encoding.ASCIISub, 354 "http://encoding.spec.whatwg.org/index-windows-1257.txt", 355 }, 356 { 357 "Windows 1258", 358 "Windows1258", 359 "", 360 "Windows1258", 361 encoding.ASCIISub, 362 "http://encoding.spec.whatwg.org/index-windows-1258.txt", 363 }, 364 { 365 "X-User-Defined", 366 "XUserDefined", 367 "It is defined at http://encoding.spec.whatwg.org/#x-user-defined", 368 "XUserDefined", 369 encoding.ASCIISub, 370 ascii + 371 "\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" + 372 "\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" + 373 "\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" + 374 "\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" + 375 "\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" + 376 "\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" + 377 "\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" + 378 "\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" + 379 "\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" + 380 "\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" + 381 "\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" + 382 "\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" + 383 "\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" + 384 "\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" + 385 "\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" + 386 "\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff", 387 }, 388 } 389 390 func getWHATWG(url string) string { 391 res, err := http.Get(url) 392 if err != nil { 393 log.Fatalf("%q: Get: %v", url, err) 394 } 395 defer res.Body.Close() 396 397 mapping := make([]rune, 128) 398 for i := range mapping { 399 mapping[i] = '\ufffd' 400 } 401 402 scanner := bufio.NewScanner(res.Body) 403 for scanner.Scan() { 404 s := strings.TrimSpace(scanner.Text()) 405 if s == "" || s[0] == '#' { 406 continue 407 } 408 x, y := 0, 0 409 if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil { 410 log.Fatalf("could not parse %q", s) 411 } 412 if x < 0 || 128 <= x { 413 log.Fatalf("code %d is out of range", x) 414 } 415 if 0x80 <= y && y < 0xa0 { 416 // We diverge from the WHATWG spec by mapping control characters 417 // in the range [0x80, 0xa0) to U+FFFD. 418 continue 419 } 420 mapping[x] = rune(y) 421 } 422 return ascii + string(mapping) 423 } 424 425 func getUCM(url string) string { 426 res, err := http.Get(url) 427 if err != nil { 428 log.Fatalf("%q: Get: %v", url, err) 429 } 430 defer res.Body.Close() 431 432 mapping := make([]rune, 256) 433 for i := range mapping { 434 mapping[i] = '\ufffd' 435 } 436 437 charsFound := 0 438 scanner := bufio.NewScanner(res.Body) 439 for scanner.Scan() { 440 s := strings.TrimSpace(scanner.Text()) 441 if s == "" || s[0] == '#' { 442 continue 443 } 444 var c byte 445 var r rune 446 if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil { 447 continue 448 } 449 mapping[c] = r 450 charsFound++ 451 } 452 453 if charsFound < 200 { 454 log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound) 455 } 456 457 return string(mapping) 458 } 459 460 func main() { 461 mibs := map[string]bool{} 462 all := []string{} 463 464 w := gen.NewCodeWriter() 465 defer w.WriteGoFile("tables.go", "charmap") 466 467 printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) } 468 469 printf("import (\n") 470 printf("\t\"github.com/go-xe2/third/golang.org/x/text/encoding\"\n") 471 printf("\t\"github.com/go-xe2/third/golang.org/x/text/encoding/internal/identifier\"\n") 472 printf(")\n\n") 473 for _, e := range encodings { 474 varNames := strings.Split(e.varName, ",") 475 all = append(all, varNames...) 476 varName := varNames[0] 477 switch { 478 case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"): 479 e.mapping = getWHATWG(e.mapping) 480 case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"): 481 e.mapping = getUCM(e.mapping) 482 } 483 484 asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00 485 if asciiSuperset { 486 low = 0x80 487 } 488 lvn := 1 489 if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") { 490 lvn = 3 491 } 492 lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:] 493 printf("// %s is the %s encoding.\n", varName, e.name) 494 if e.comment != "" { 495 printf("//\n// %s\n", e.comment) 496 } 497 printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n", 498 varName, lowerVarName, lowerVarName, e.name) 499 if mibs[e.mib] { 500 log.Fatalf("MIB type %q declared multiple times.", e.mib) 501 } 502 printf("mib: identifier.%s,\n", e.mib) 503 printf("asciiSuperset: %t,\n", asciiSuperset) 504 printf("low: 0x%02x,\n", low) 505 printf("replacement: 0x%02x,\n", e.replacement) 506 507 printf("decode: [256]utf8Enc{\n") 508 i, backMapping := 0, map[rune]byte{} 509 for _, c := range e.mapping { 510 if _, ok := backMapping[c]; !ok && c != utf8.RuneError { 511 backMapping[c] = byte(i) 512 } 513 var buf [8]byte 514 n := utf8.EncodeRune(buf[:], c) 515 if n > 3 { 516 panic(fmt.Sprintf("rune %q (%U) is too long", c, c)) 517 } 518 printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2]) 519 if i%2 == 1 { 520 printf("\n") 521 } 522 i++ 523 } 524 printf("},\n") 525 526 printf("encode: [256]uint32{\n") 527 encode := make([]uint32, 0, 256) 528 for c, i := range backMapping { 529 encode = append(encode, uint32(i)<<24|uint32(c)) 530 } 531 sort.Sort(byRune(encode)) 532 for len(encode) < cap(encode) { 533 encode = append(encode, encode[len(encode)-1]) 534 } 535 for i, enc := range encode { 536 printf("0x%08x,", enc) 537 if i%8 == 7 { 538 printf("\n") 539 } 540 } 541 printf("},\n}\n") 542 543 // Add an estimate of the size of a single Charmap{} struct value, which 544 // includes two 256 elem arrays of 4 bytes and some extra fields, which 545 // align to 3 uint64s on 64-bit architectures. 546 w.Size += 2*4*256 + 3*8 547 } 548 // TODO: add proper line breaking. 549 printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n")) 550 } 551 552 type byRune []uint32 553 554 func (b byRune) Len() int { return len(b) } 555 func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff } 556 func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] }