github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/encoding/charmap/maketables.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  // +build ignore
     7  
     8  package main
     9  
    10  import (
    11  	"bufio"
    12  	"fmt"
    13  	"log"
    14  	"net/http"
    15  	"sort"
    16  	"strings"
    17  	"unicode/utf8"
    18  
    19  	"github.com/go-enjin/golang-org-x-text/encoding"
    20  	"github.com/go-enjin/golang-org-x-text/internal/gen"
    21  )
    22  
    23  const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
    24  	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
    25  	` !"#$%&'()*+,-./0123456789:;<=>?` +
    26  	`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
    27  	"`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
    28  
    29  var encodings = []struct {
    30  	name        string
    31  	mib         string
    32  	comment     string
    33  	varName     string
    34  	replacement byte
    35  	mapping     string
    36  }{
    37  	{
    38  		"IBM Code Page 037",
    39  		"IBM037",
    40  		"",
    41  		"CodePage037",
    42  		0x3f,
    43  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
    44  	},
    45  	{
    46  		"IBM Code Page 437",
    47  		"PC8CodePage437",
    48  		"",
    49  		"CodePage437",
    50  		encoding.ASCIISub,
    51  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
    52  	},
    53  	{
    54  		"IBM Code Page 850",
    55  		"PC850Multilingual",
    56  		"",
    57  		"CodePage850",
    58  		encoding.ASCIISub,
    59  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
    60  	},
    61  	{
    62  		"IBM Code Page 852",
    63  		"PCp852",
    64  		"",
    65  		"CodePage852",
    66  		encoding.ASCIISub,
    67  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
    68  	},
    69  	{
    70  		"IBM Code Page 855",
    71  		"IBM855",
    72  		"",
    73  		"CodePage855",
    74  		encoding.ASCIISub,
    75  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
    76  	},
    77  	{
    78  		"Windows Code Page 858", // PC latin1 with Euro
    79  		"IBM00858",
    80  		"",
    81  		"CodePage858",
    82  		encoding.ASCIISub,
    83  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
    84  	},
    85  	{
    86  		"IBM Code Page 860",
    87  		"IBM860",
    88  		"",
    89  		"CodePage860",
    90  		encoding.ASCIISub,
    91  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
    92  	},
    93  	{
    94  		"IBM Code Page 862",
    95  		"PC862LatinHebrew",
    96  		"",
    97  		"CodePage862",
    98  		encoding.ASCIISub,
    99  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
   100  	},
   101  	{
   102  		"IBM Code Page 863",
   103  		"IBM863",
   104  		"",
   105  		"CodePage863",
   106  		encoding.ASCIISub,
   107  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
   108  	},
   109  	{
   110  		"IBM Code Page 865",
   111  		"IBM865",
   112  		"",
   113  		"CodePage865",
   114  		encoding.ASCIISub,
   115  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
   116  	},
   117  	{
   118  		"IBM Code Page 866",
   119  		"IBM866",
   120  		"",
   121  		"CodePage866",
   122  		encoding.ASCIISub,
   123  		"http://encoding.spec.whatwg.org/index-ibm866.txt",
   124  	},
   125  	{
   126  		"IBM Code Page 1047",
   127  		"IBM1047",
   128  		"",
   129  		"CodePage1047",
   130  		0x3f,
   131  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
   132  	},
   133  	{
   134  		"IBM Code Page 1140",
   135  		"IBM01140",
   136  		"",
   137  		"CodePage1140",
   138  		0x3f,
   139  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
   140  	},
   141  	{
   142  		"ISO 8859-1",
   143  		"ISOLatin1",
   144  		"",
   145  		"ISO8859_1",
   146  		encoding.ASCIISub,
   147  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
   148  	},
   149  	{
   150  		"ISO 8859-2",
   151  		"ISOLatin2",
   152  		"",
   153  		"ISO8859_2",
   154  		encoding.ASCIISub,
   155  		"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
   156  	},
   157  	{
   158  		"ISO 8859-3",
   159  		"ISOLatin3",
   160  		"",
   161  		"ISO8859_3",
   162  		encoding.ASCIISub,
   163  		"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
   164  	},
   165  	{
   166  		"ISO 8859-4",
   167  		"ISOLatin4",
   168  		"",
   169  		"ISO8859_4",
   170  		encoding.ASCIISub,
   171  		"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
   172  	},
   173  	{
   174  		"ISO 8859-5",
   175  		"ISOLatinCyrillic",
   176  		"",
   177  		"ISO8859_5",
   178  		encoding.ASCIISub,
   179  		"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
   180  	},
   181  	{
   182  		"ISO 8859-6",
   183  		"ISOLatinArabic",
   184  		"",
   185  		"ISO8859_6,ISO8859_6E,ISO8859_6I",
   186  		encoding.ASCIISub,
   187  		"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
   188  	},
   189  	{
   190  		"ISO 8859-7",
   191  		"ISOLatinGreek",
   192  		"",
   193  		"ISO8859_7",
   194  		encoding.ASCIISub,
   195  		"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
   196  	},
   197  	{
   198  		"ISO 8859-8",
   199  		"ISOLatinHebrew",
   200  		"",
   201  		"ISO8859_8,ISO8859_8E,ISO8859_8I",
   202  		encoding.ASCIISub,
   203  		"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
   204  	},
   205  	{
   206  		"ISO 8859-9",
   207  		"ISOLatin5",
   208  		"",
   209  		"ISO8859_9",
   210  		encoding.ASCIISub,
   211  		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
   212  	},
   213  	{
   214  		"ISO 8859-10",
   215  		"ISOLatin6",
   216  		"",
   217  		"ISO8859_10",
   218  		encoding.ASCIISub,
   219  		"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
   220  	},
   221  	{
   222  		"ISO 8859-13",
   223  		"ISO885913",
   224  		"",
   225  		"ISO8859_13",
   226  		encoding.ASCIISub,
   227  		"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
   228  	},
   229  	{
   230  		"ISO 8859-14",
   231  		"ISO885914",
   232  		"",
   233  		"ISO8859_14",
   234  		encoding.ASCIISub,
   235  		"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
   236  	},
   237  	{
   238  		"ISO 8859-15",
   239  		"ISO885915",
   240  		"",
   241  		"ISO8859_15",
   242  		encoding.ASCIISub,
   243  		"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
   244  	},
   245  	{
   246  		"ISO 8859-16",
   247  		"ISO885916",
   248  		"",
   249  		"ISO8859_16",
   250  		encoding.ASCIISub,
   251  		"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
   252  	},
   253  	{
   254  		"KOI8-R",
   255  		"KOI8R",
   256  		"",
   257  		"KOI8R",
   258  		encoding.ASCIISub,
   259  		"http://encoding.spec.whatwg.org/index-koi8-r.txt",
   260  	},
   261  	{
   262  		"KOI8-U",
   263  		"KOI8U",
   264  		"",
   265  		"KOI8U",
   266  		encoding.ASCIISub,
   267  		"http://encoding.spec.whatwg.org/index-koi8-u.txt",
   268  	},
   269  	{
   270  		"Macintosh",
   271  		"Macintosh",
   272  		"",
   273  		"Macintosh",
   274  		encoding.ASCIISub,
   275  		"http://encoding.spec.whatwg.org/index-macintosh.txt",
   276  	},
   277  	{
   278  		"Macintosh Cyrillic",
   279  		"MacintoshCyrillic",
   280  		"",
   281  		"MacintoshCyrillic",
   282  		encoding.ASCIISub,
   283  		"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
   284  	},
   285  	{
   286  		"Windows 874",
   287  		"Windows874",
   288  		"",
   289  		"Windows874",
   290  		encoding.ASCIISub,
   291  		"http://encoding.spec.whatwg.org/index-windows-874.txt",
   292  	},
   293  	{
   294  		"Windows 1250",
   295  		"Windows1250",
   296  		"",
   297  		"Windows1250",
   298  		encoding.ASCIISub,
   299  		"http://encoding.spec.whatwg.org/index-windows-1250.txt",
   300  	},
   301  	{
   302  		"Windows 1251",
   303  		"Windows1251",
   304  		"",
   305  		"Windows1251",
   306  		encoding.ASCIISub,
   307  		"http://encoding.spec.whatwg.org/index-windows-1251.txt",
   308  	},
   309  	{
   310  		"Windows 1252",
   311  		"Windows1252",
   312  		"",
   313  		"Windows1252",
   314  		encoding.ASCIISub,
   315  		"http://encoding.spec.whatwg.org/index-windows-1252.txt",
   316  	},
   317  	{
   318  		"Windows 1253",
   319  		"Windows1253",
   320  		"",
   321  		"Windows1253",
   322  		encoding.ASCIISub,
   323  		"http://encoding.spec.whatwg.org/index-windows-1253.txt",
   324  	},
   325  	{
   326  		"Windows 1254",
   327  		"Windows1254",
   328  		"",
   329  		"Windows1254",
   330  		encoding.ASCIISub,
   331  		"http://encoding.spec.whatwg.org/index-windows-1254.txt",
   332  	},
   333  	{
   334  		"Windows 1255",
   335  		"Windows1255",
   336  		"",
   337  		"Windows1255",
   338  		encoding.ASCIISub,
   339  		"http://encoding.spec.whatwg.org/index-windows-1255.txt",
   340  	},
   341  	{
   342  		"Windows 1256",
   343  		"Windows1256",
   344  		"",
   345  		"Windows1256",
   346  		encoding.ASCIISub,
   347  		"http://encoding.spec.whatwg.org/index-windows-1256.txt",
   348  	},
   349  	{
   350  		"Windows 1257",
   351  		"Windows1257",
   352  		"",
   353  		"Windows1257",
   354  		encoding.ASCIISub,
   355  		"http://encoding.spec.whatwg.org/index-windows-1257.txt",
   356  	},
   357  	{
   358  		"Windows 1258",
   359  		"Windows1258",
   360  		"",
   361  		"Windows1258",
   362  		encoding.ASCIISub,
   363  		"http://encoding.spec.whatwg.org/index-windows-1258.txt",
   364  	},
   365  	{
   366  		"X-User-Defined",
   367  		"XUserDefined",
   368  		"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
   369  		"XUserDefined",
   370  		encoding.ASCIISub,
   371  		ascii +
   372  			"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
   373  			"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
   374  			"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
   375  			"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
   376  			"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
   377  			"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
   378  			"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
   379  			"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
   380  			"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
   381  			"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
   382  			"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
   383  			"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
   384  			"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
   385  			"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
   386  			"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
   387  			"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
   388  	},
   389  }
   390  
   391  func getWHATWG(url string) string {
   392  	res, err := http.Get(url)
   393  	if err != nil {
   394  		log.Fatalf("%q: Get: %v", url, err)
   395  	}
   396  	defer res.Body.Close()
   397  
   398  	mapping := make([]rune, 128)
   399  	for i := range mapping {
   400  		mapping[i] = '\ufffd'
   401  	}
   402  
   403  	scanner := bufio.NewScanner(res.Body)
   404  	for scanner.Scan() {
   405  		s := strings.TrimSpace(scanner.Text())
   406  		if s == "" || s[0] == '#' {
   407  			continue
   408  		}
   409  		x, y := 0, 0
   410  		if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
   411  			log.Fatalf("could not parse %q", s)
   412  		}
   413  		if x < 0 || 128 <= x {
   414  			log.Fatalf("code %d is out of range", x)
   415  		}
   416  		if 0x80 <= y && y < 0xa0 {
   417  			// We diverge from the WHATWG spec by mapping control characters
   418  			// in the range [0x80, 0xa0) to U+FFFD.
   419  			continue
   420  		}
   421  		mapping[x] = rune(y)
   422  	}
   423  	return ascii + string(mapping)
   424  }
   425  
   426  func getUCM(url string) string {
   427  	res, err := http.Get(url)
   428  	if err != nil {
   429  		log.Fatalf("%q: Get: %v", url, err)
   430  	}
   431  	defer res.Body.Close()
   432  
   433  	mapping := make([]rune, 256)
   434  	for i := range mapping {
   435  		mapping[i] = '\ufffd'
   436  	}
   437  
   438  	charsFound := 0
   439  	scanner := bufio.NewScanner(res.Body)
   440  	for scanner.Scan() {
   441  		s := strings.TrimSpace(scanner.Text())
   442  		if s == "" || s[0] == '#' {
   443  			continue
   444  		}
   445  		var c byte
   446  		var r rune
   447  		if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
   448  			continue
   449  		}
   450  		mapping[c] = r
   451  		charsFound++
   452  	}
   453  
   454  	if charsFound < 200 {
   455  		log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
   456  	}
   457  
   458  	return string(mapping)
   459  }
   460  
   461  func main() {
   462  	mibs := map[string]bool{}
   463  	all := []string{}
   464  
   465  	w := gen.NewCodeWriter()
   466  	defer w.WriteGoFile("tables.go", "charmap")
   467  
   468  	printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
   469  
   470  	printf("import (\n")
   471  	printf("\t\"github.com/go-enjin/golang-org-x-text/encoding\"\n")
   472  	printf("\t\"github.com/go-enjin/golang-org-x-text/encoding/internal/identifier\"\n")
   473  	printf(")\n\n")
   474  	for _, e := range encodings {
   475  		varNames := strings.Split(e.varName, ",")
   476  		all = append(all, varNames...)
   477  		varName := varNames[0]
   478  		switch {
   479  		case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
   480  			e.mapping = getWHATWG(e.mapping)
   481  		case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
   482  			e.mapping = getUCM(e.mapping)
   483  		}
   484  
   485  		asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
   486  		if asciiSuperset {
   487  			low = 0x80
   488  		}
   489  		lvn := 1
   490  		if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
   491  			lvn = 3
   492  		}
   493  		lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
   494  		printf("// %s is the %s encoding.\n", varName, e.name)
   495  		if e.comment != "" {
   496  			printf("//\n// %s\n", e.comment)
   497  		}
   498  		printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
   499  			varName, lowerVarName, lowerVarName, e.name)
   500  		if mibs[e.mib] {
   501  			log.Fatalf("MIB type %q declared multiple times.", e.mib)
   502  		}
   503  		printf("mib: identifier.%s,\n", e.mib)
   504  		printf("asciiSuperset: %t,\n", asciiSuperset)
   505  		printf("low: 0x%02x,\n", low)
   506  		printf("replacement: 0x%02x,\n", e.replacement)
   507  
   508  		printf("decode: [256]utf8Enc{\n")
   509  		i, backMapping := 0, map[rune]byte{}
   510  		for _, c := range e.mapping {
   511  			if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
   512  				backMapping[c] = byte(i)
   513  			}
   514  			var buf [8]byte
   515  			n := utf8.EncodeRune(buf[:], c)
   516  			if n > 3 {
   517  				panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
   518  			}
   519  			printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
   520  			if i%2 == 1 {
   521  				printf("\n")
   522  			}
   523  			i++
   524  		}
   525  		printf("},\n")
   526  
   527  		printf("encode: [256]uint32{\n")
   528  		encode := make([]uint32, 0, 256)
   529  		for c, i := range backMapping {
   530  			encode = append(encode, uint32(i)<<24|uint32(c))
   531  		}
   532  		sort.Sort(byRune(encode))
   533  		for len(encode) < cap(encode) {
   534  			encode = append(encode, encode[len(encode)-1])
   535  		}
   536  		for i, enc := range encode {
   537  			printf("0x%08x,", enc)
   538  			if i%8 == 7 {
   539  				printf("\n")
   540  			}
   541  		}
   542  		printf("},\n}\n")
   543  
   544  		// Add an estimate of the size of a single Charmap{} struct value, which
   545  		// includes two 256 elem arrays of 4 bytes and some extra fields, which
   546  		// align to 3 uint64s on 64-bit architectures.
   547  		w.Size += 2*4*256 + 3*8
   548  	}
   549  	// TODO: add proper line breaking.
   550  	printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
   551  }
   552  
   553  type byRune []uint32
   554  
   555  func (b byRune) Len() int           { return len(b) }
   556  func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
   557  func (b byRune) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }