github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/encoding/charmap/maketables.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  package main
     8  
     9  import (
    10  	"bufio"
    11  	"fmt"
    12  	"log"
    13  	"net/http"
    14  	"sort"
    15  	"strings"
    16  	"unicode/utf8"
    17  
    18  	"golang.org/x/text/encoding"
    19  	"golang.org/x/text/internal/gen"
    20  )
    21  
    22  const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
    23  	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
    24  	` !"#$%&'()*+,-./0123456789:;<=>?` +
    25  	`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
    26  	"`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
    27  
    28  var encodings = []struct {
    29  	name        string
    30  	mib         string
    31  	comment     string
    32  	varName     string
    33  	replacement byte
    34  	mapping     string
    35  }{
    36  	{
    37  		"IBM Code Page 437",
    38  		"PC8CodePage437",
    39  		"",
    40  		"CodePage437",
    41  		encoding.ASCIISub,
    42  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
    43  	},
    44  	{
    45  		"IBM Code Page 850",
    46  		"PC850Multilingual",
    47  		"",
    48  		"CodePage850",
    49  		encoding.ASCIISub,
    50  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
    51  	},
    52  	{
    53  		"IBM Code Page 852",
    54  		"PCp852",
    55  		"",
    56  		"CodePage852",
    57  		encoding.ASCIISub,
    58  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
    59  	},
    60  	{
    61  		"IBM Code Page 855",
    62  		"IBM855",
    63  		"",
    64  		"CodePage855",
    65  		encoding.ASCIISub,
    66  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
    67  	},
    68  	{
    69  		"Windows Code Page 858", // PC latin1 with Euro
    70  		"IBM00858",
    71  		"",
    72  		"CodePage858",
    73  		encoding.ASCIISub,
    74  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
    75  	},
    76  	{
    77  		"IBM Code Page 862",
    78  		"PC862LatinHebrew",
    79  		"",
    80  		"CodePage862",
    81  		encoding.ASCIISub,
    82  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
    83  	},
    84  	{
    85  		"IBM Code Page 866",
    86  		"IBM866",
    87  		"",
    88  		"CodePage866",
    89  		encoding.ASCIISub,
    90  		"http://encoding.spec.whatwg.org/index-ibm866.txt",
    91  	},
    92  	{
    93  		"ISO 8859-2",
    94  		"ISOLatin2",
    95  		"",
    96  		"ISO8859_2",
    97  		encoding.ASCIISub,
    98  		"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
    99  	},
   100  	{
   101  		"ISO 8859-3",
   102  		"ISOLatin3",
   103  		"",
   104  		"ISO8859_3",
   105  		encoding.ASCIISub,
   106  		"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
   107  	},
   108  	{
   109  		"ISO 8859-4",
   110  		"ISOLatin4",
   111  		"",
   112  		"ISO8859_4",
   113  		encoding.ASCIISub,
   114  		"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
   115  	},
   116  	{
   117  		"ISO 8859-5",
   118  		"ISOLatinCyrillic",
   119  		"",
   120  		"ISO8859_5",
   121  		encoding.ASCIISub,
   122  		"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
   123  	},
   124  	{
   125  		"ISO 8859-6",
   126  		"ISOLatinArabic",
   127  		"",
   128  		"ISO8859_6,ISO8859_6E,ISO8859_6I",
   129  		encoding.ASCIISub,
   130  		"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
   131  	},
   132  	{
   133  		"ISO 8859-7",
   134  		"ISOLatinGreek",
   135  		"",
   136  		"ISO8859_7",
   137  		encoding.ASCIISub,
   138  		"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
   139  	},
   140  	{
   141  		"ISO 8859-8",
   142  		"ISOLatinHebrew",
   143  		"",
   144  		"ISO8859_8,ISO8859_8E,ISO8859_8I",
   145  		encoding.ASCIISub,
   146  		"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
   147  	},
   148  	{
   149  		"ISO 8859-10",
   150  		"ISOLatin6",
   151  		"",
   152  		"ISO8859_10",
   153  		encoding.ASCIISub,
   154  		"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
   155  	},
   156  	{
   157  		"ISO 8859-13",
   158  		"ISO885913",
   159  		"",
   160  		"ISO8859_13",
   161  		encoding.ASCIISub,
   162  		"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
   163  	},
   164  	{
   165  		"ISO 8859-14",
   166  		"ISO885914",
   167  		"",
   168  		"ISO8859_14",
   169  		encoding.ASCIISub,
   170  		"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
   171  	},
   172  	{
   173  		"ISO 8859-15",
   174  		"ISO885915",
   175  		"",
   176  		"ISO8859_15",
   177  		encoding.ASCIISub,
   178  		"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
   179  	},
   180  	{
   181  		"ISO 8859-16",
   182  		"ISO885916",
   183  		"",
   184  		"ISO8859_16",
   185  		encoding.ASCIISub,
   186  		"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
   187  	},
   188  	{
   189  		"KOI8-R",
   190  		"KOI8R",
   191  		"",
   192  		"KOI8R",
   193  		encoding.ASCIISub,
   194  		"http://encoding.spec.whatwg.org/index-koi8-r.txt",
   195  	},
   196  	{
   197  		"KOI8-U",
   198  		"KOI8U",
   199  		"",
   200  		"KOI8U",
   201  		encoding.ASCIISub,
   202  		"http://encoding.spec.whatwg.org/index-koi8-u.txt",
   203  	},
   204  	{
   205  		"Macintosh",
   206  		"Macintosh",
   207  		"",
   208  		"Macintosh",
   209  		encoding.ASCIISub,
   210  		"http://encoding.spec.whatwg.org/index-macintosh.txt",
   211  	},
   212  	{
   213  		"Macintosh Cyrillic",
   214  		"MacintoshCyrillic",
   215  		"",
   216  		"MacintoshCyrillic",
   217  		encoding.ASCIISub,
   218  		"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
   219  	},
   220  	{
   221  		"Windows 874",
   222  		"Windows874",
   223  		"",
   224  		"Windows874",
   225  		encoding.ASCIISub,
   226  		"http://encoding.spec.whatwg.org/index-windows-874.txt",
   227  	},
   228  	{
   229  		"Windows 1250",
   230  		"Windows1250",
   231  		"",
   232  		"Windows1250",
   233  		encoding.ASCIISub,
   234  		"http://encoding.spec.whatwg.org/index-windows-1250.txt",
   235  	},
   236  	{
   237  		"Windows 1251",
   238  		"Windows1251",
   239  		"",
   240  		"Windows1251",
   241  		encoding.ASCIISub,
   242  		"http://encoding.spec.whatwg.org/index-windows-1251.txt",
   243  	},
   244  	{
   245  		"Windows 1252",
   246  		"Windows1252",
   247  		"",
   248  		"Windows1252",
   249  		encoding.ASCIISub,
   250  		"http://encoding.spec.whatwg.org/index-windows-1252.txt",
   251  	},
   252  	{
   253  		"Windows 1253",
   254  		"Windows1253",
   255  		"",
   256  		"Windows1253",
   257  		encoding.ASCIISub,
   258  		"http://encoding.spec.whatwg.org/index-windows-1253.txt",
   259  	},
   260  	{
   261  		"Windows 1254",
   262  		"Windows1254",
   263  		"",
   264  		"Windows1254",
   265  		encoding.ASCIISub,
   266  		"http://encoding.spec.whatwg.org/index-windows-1254.txt",
   267  	},
   268  	{
   269  		"Windows 1255",
   270  		"Windows1255",
   271  		"",
   272  		"Windows1255",
   273  		encoding.ASCIISub,
   274  		"http://encoding.spec.whatwg.org/index-windows-1255.txt",
   275  	},
   276  	{
   277  		"Windows 1256",
   278  		"Windows1256",
   279  		"",
   280  		"Windows1256",
   281  		encoding.ASCIISub,
   282  		"http://encoding.spec.whatwg.org/index-windows-1256.txt",
   283  	},
   284  	{
   285  		"Windows 1257",
   286  		"Windows1257",
   287  		"",
   288  		"Windows1257",
   289  		encoding.ASCIISub,
   290  		"http://encoding.spec.whatwg.org/index-windows-1257.txt",
   291  	},
   292  	{
   293  		"Windows 1258",
   294  		"Windows1258",
   295  		"",
   296  		"Windows1258",
   297  		encoding.ASCIISub,
   298  		"http://encoding.spec.whatwg.org/index-windows-1258.txt",
   299  	},
   300  	{
   301  		"X-User-Defined",
   302  		"XUserDefined",
   303  		"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
   304  		"XUserDefined",
   305  		encoding.ASCIISub,
   306  		ascii +
   307  			"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
   308  			"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
   309  			"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
   310  			"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
   311  			"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
   312  			"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
   313  			"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
   314  			"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
   315  			"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
   316  			"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
   317  			"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
   318  			"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
   319  			"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
   320  			"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
   321  			"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
   322  			"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
   323  	},
   324  }
   325  
   326  func getWHATWG(url string) string {
   327  	res, err := http.Get(url)
   328  	if err != nil {
   329  		log.Fatalf("%q: Get: %v", url, err)
   330  	}
   331  	defer res.Body.Close()
   332  
   333  	mapping := make([]rune, 128)
   334  	for i := range mapping {
   335  		mapping[i] = '\ufffd'
   336  	}
   337  
   338  	scanner := bufio.NewScanner(res.Body)
   339  	for scanner.Scan() {
   340  		s := strings.TrimSpace(scanner.Text())
   341  		if s == "" || s[0] == '#' {
   342  			continue
   343  		}
   344  		x, y := 0, 0
   345  		if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
   346  			log.Fatalf("could not parse %q", s)
   347  		}
   348  		if x < 0 || 128 <= x {
   349  			log.Fatalf("code %d is out of range", x)
   350  		}
   351  		if 0x80 <= y && y < 0xa0 {
   352  			// We diverge from the WHATWG spec by mapping control characters
   353  			// in the range [0x80, 0xa0) to U+FFFD.
   354  			continue
   355  		}
   356  		mapping[x] = rune(y)
   357  	}
   358  	return ascii + string(mapping)
   359  }
   360  
   361  func getUCM(url string) string {
   362  	res, err := http.Get(url)
   363  	if err != nil {
   364  		log.Fatalf("%q: Get: %v", url, err)
   365  	}
   366  	defer res.Body.Close()
   367  
   368  	mapping := make([]rune, 256)
   369  	for i := range mapping {
   370  		mapping[i] = '\ufffd'
   371  	}
   372  
   373  	charsFound := 0
   374  	scanner := bufio.NewScanner(res.Body)
   375  	for scanner.Scan() {
   376  		s := strings.TrimSpace(scanner.Text())
   377  		if s == "" || s[0] == '#' {
   378  			continue
   379  		}
   380  		var c byte
   381  		var r rune
   382  		if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
   383  			continue
   384  		}
   385  		mapping[c] = r
   386  		charsFound++
   387  	}
   388  
   389  	if charsFound < 200 {
   390  		log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
   391  	}
   392  
   393  	return string(mapping)
   394  }
   395  
   396  func main() {
   397  	mibs := map[string]bool{}
   398  	all := []string{}
   399  
   400  	w := gen.NewCodeWriter()
   401  	defer w.WriteGoFile("tables.go", "charmap")
   402  
   403  	printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
   404  
   405  	printf("import (\n")
   406  	printf("\t\"golang.org/x/text/encoding\"\n")
   407  	printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
   408  	printf(")\n\n")
   409  	for _, e := range encodings {
   410  		varNames := strings.Split(e.varName, ",")
   411  		all = append(all, varNames...)
   412  		varName := varNames[0]
   413  		switch {
   414  		case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
   415  			e.mapping = getWHATWG(e.mapping)
   416  		case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
   417  			e.mapping = getUCM(e.mapping)
   418  		}
   419  
   420  		asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
   421  		if asciiSuperset {
   422  			low = 0x80
   423  		}
   424  		lvn := 1
   425  		if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
   426  			lvn = 3
   427  		}
   428  		lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
   429  		printf("// %s is the %s encoding.\n", varName, e.name)
   430  		if e.comment != "" {
   431  			printf("//\n// %s\n", e.comment)
   432  		}
   433  		printf("var %s encoding.Encoding = &%s\n\nvar %s = charmap{\nname: %q,\n",
   434  			varName, lowerVarName, lowerVarName, e.name)
   435  		if mibs[e.mib] {
   436  			log.Fatalf("MIB type %q declared multiple times.", e.mib)
   437  		}
   438  		printf("mib: identifier.%s,\n", e.mib)
   439  		printf("asciiSuperset: %t,\n", asciiSuperset)
   440  		printf("low: 0x%02x,\n", low)
   441  		printf("replacement: 0x%02x,\n", e.replacement)
   442  
   443  		printf("decode: [256]utf8Enc{\n")
   444  		i, backMapping := 0, map[rune]byte{}
   445  		for _, c := range e.mapping {
   446  			if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
   447  				backMapping[c] = byte(i)
   448  			}
   449  			var buf [8]byte
   450  			n := utf8.EncodeRune(buf[:], c)
   451  			if n > 3 {
   452  				panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
   453  			}
   454  			printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
   455  			if i%2 == 1 {
   456  				printf("\n")
   457  			}
   458  			i++
   459  		}
   460  		printf("},\n")
   461  
   462  		printf("encode: [256]uint32{\n")
   463  		encode := make([]uint32, 0, 256)
   464  		for c, i := range backMapping {
   465  			encode = append(encode, uint32(i)<<24|uint32(c))
   466  		}
   467  		sort.Sort(byRune(encode))
   468  		for len(encode) < cap(encode) {
   469  			encode = append(encode, encode[len(encode)-1])
   470  		}
   471  		for i, enc := range encode {
   472  			printf("0x%08x,", enc)
   473  			if i%8 == 7 {
   474  				printf("\n")
   475  			}
   476  		}
   477  		printf("},\n}\n")
   478  
   479  		// Add an estimate of the size of a single charmap{} struct value, which
   480  		// includes two 256 elem arrays of 4 bytes and some extra fields, which
   481  		// align to 3 uint64s on 64-bit architectures.
   482  		w.Size += 2*4*256 + 3*8
   483  	}
   484  	// TODO: add proper line breaking.
   485  	printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
   486  }
   487  
   488  type byRune []uint32
   489  
   490  func (b byRune) Len() int           { return len(b) }
   491  func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
   492  func (b byRune) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }