github.com/vugu/vugu@v0.3.6-0.20240430171613-3f6f402e014b/internal/htmlx/atom/gen.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  // +build ignore
     7  
     8  //go:generate go run gen.go
     9  //go:generate go run gen.go -test
    10  
    11  package main
    12  
    13  import (
    14  	"bytes"
    15  	"flag"
    16  	"fmt"
    17  	"go/format"
    18  	"math/rand"
    19  	"os"
    20  	"sort"
    21  	"strings"
    22  )
    23  
    24  // identifier converts s to a Go exported identifier.
    25  // It converts "div" to "Div" and "accept-charset" to "AcceptCharset".
    26  func identifier(s string) string {
    27  	b := make([]byte, 0, len(s))
    28  	cap := true
    29  	for _, c := range s {
    30  		if c == '-' {
    31  			cap = true
    32  			continue
    33  		}
    34  		if cap && 'a' <= c && c <= 'z' {
    35  			c -= 'a' - 'A'
    36  		}
    37  		cap = false
    38  		b = append(b, byte(c))
    39  	}
    40  	return string(b)
    41  }
    42  
    43  var test = flag.Bool("test", false, "generate table_test.go")
    44  
    45  func genFile(name string, buf *bytes.Buffer) {
    46  	b, err := format.Source(buf.Bytes())
    47  	if err != nil {
    48  		fmt.Fprintln(os.Stderr, err)
    49  		os.Exit(1)
    50  	}
    51  	if err := os.WriteFile(name, b, 0644); err != nil {
    52  		fmt.Fprintln(os.Stderr, err)
    53  		os.Exit(1)
    54  	}
    55  }
    56  
    57  func main() {
    58  	flag.Parse()
    59  
    60  	var all []string
    61  	all = append(all, elements...)
    62  	all = append(all, attributes...)
    63  	all = append(all, eventHandlers...)
    64  	all = append(all, extra...)
    65  	sort.Strings(all)
    66  
    67  	// uniq - lists have dups
    68  	w := 0
    69  	for _, s := range all {
    70  		if w == 0 || all[w-1] != s {
    71  			all[w] = s
    72  			w++
    73  		}
    74  	}
    75  	all = all[:w]
    76  
    77  	if *test {
    78  		var buf bytes.Buffer
    79  		fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n")
    80  		fmt.Fprintln(&buf, "//go:generate go run gen.go -test\n")
    81  		fmt.Fprintln(&buf, "package atom\n")
    82  		fmt.Fprintln(&buf, "var testAtomList = []string{")
    83  		for _, s := range all {
    84  			fmt.Fprintf(&buf, "\t%q,\n", s)
    85  		}
    86  		fmt.Fprintln(&buf, "}")
    87  
    88  		genFile("table_test.go", &buf)
    89  		return
    90  	}
    91  
    92  	// Find hash that minimizes table size.
    93  	var best *table
    94  	for i := 0; i < 1000000; i++ {
    95  		if best != nil && 1<<(best.k-1) < len(all) {
    96  			break
    97  		}
    98  		h := rand.Uint32()
    99  		for k := uint(0); k <= 16; k++ {
   100  			if best != nil && k >= best.k {
   101  				break
   102  			}
   103  			var t table
   104  			if t.init(h, k, all) {
   105  				best = &t
   106  				break
   107  			}
   108  		}
   109  	}
   110  	if best == nil {
   111  		fmt.Fprintf(os.Stderr, "failed to construct string table\n")
   112  		os.Exit(1)
   113  	}
   114  
   115  	// Lay out strings, using overlaps when possible.
   116  	layout := append([]string{}, all...)
   117  
   118  	// Remove strings that are substrings of other strings
   119  	for changed := true; changed; {
   120  		changed = false
   121  		for i, s := range layout {
   122  			if s == "" {
   123  				continue
   124  			}
   125  			for j, t := range layout {
   126  				if i != j && t != "" && strings.Contains(s, t) {
   127  					changed = true
   128  					layout[j] = ""
   129  				}
   130  			}
   131  		}
   132  	}
   133  
   134  	// Join strings where one suffix matches another prefix.
   135  	for {
   136  		// Find best i, j, k such that layout[i][len-k:] == layout[j][:k],
   137  		// maximizing overlap length k.
   138  		besti := -1
   139  		bestj := -1
   140  		bestk := 0
   141  		for i, s := range layout {
   142  			if s == "" {
   143  				continue
   144  			}
   145  			for j, t := range layout {
   146  				if i == j {
   147  					continue
   148  				}
   149  				for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
   150  					if s[len(s)-k:] == t[:k] {
   151  						besti = i
   152  						bestj = j
   153  						bestk = k
   154  					}
   155  				}
   156  			}
   157  		}
   158  		if bestk > 0 {
   159  			layout[besti] += layout[bestj][bestk:]
   160  			layout[bestj] = ""
   161  			continue
   162  		}
   163  		break
   164  	}
   165  
   166  	text := strings.Join(layout, "")
   167  
   168  	atom := map[string]uint32{}
   169  	for _, s := range all {
   170  		off := strings.Index(text, s)
   171  		if off < 0 {
   172  			panic("lost string " + s)
   173  		}
   174  		atom[s] = uint32(off<<8 | len(s))
   175  	}
   176  
   177  	var buf bytes.Buffer
   178  	// Generate the Go code.
   179  	fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n")
   180  	fmt.Fprintln(&buf, "//go:generate go run gen.go\n")
   181  	fmt.Fprintln(&buf, "package atom\n\nconst (")
   182  
   183  	// compute max len
   184  	maxLen := 0
   185  	for _, s := range all {
   186  		if maxLen < len(s) {
   187  			maxLen = len(s)
   188  		}
   189  		fmt.Fprintf(&buf, "\t%s Atom = %#x\n", identifier(s), atom[s])
   190  	}
   191  	fmt.Fprintln(&buf, ")\n")
   192  
   193  	fmt.Fprintf(&buf, "const hash0 = %#x\n\n", best.h0)
   194  	fmt.Fprintf(&buf, "const maxAtomLen = %d\n\n", maxLen)
   195  
   196  	fmt.Fprintf(&buf, "var table = [1<<%d]Atom{\n", best.k)
   197  	for i, s := range best.tab {
   198  		if s == "" {
   199  			continue
   200  		}
   201  		fmt.Fprintf(&buf, "\t%#x: %#x, // %s\n", i, atom[s], s)
   202  	}
   203  	fmt.Fprintf(&buf, "}\n")
   204  	datasize := (1 << best.k) * 4
   205  
   206  	fmt.Fprintln(&buf, "const atomText =")
   207  	textsize := len(text)
   208  	for len(text) > 60 {
   209  		fmt.Fprintf(&buf, "\t%q +\n", text[:60])
   210  		text = text[60:]
   211  	}
   212  	fmt.Fprintf(&buf, "\t%q\n\n", text)
   213  
   214  	genFile("table.go", &buf)
   215  
   216  	fmt.Fprintf(os.Stdout, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize)
   217  }
   218  
   219  type byLen []string
   220  
   221  func (x byLen) Less(i, j int) bool { return len(x[i]) > len(x[j]) }
   222  func (x byLen) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }
   223  func (x byLen) Len() int           { return len(x) }
   224  
   225  // fnv computes the FNV hash with an arbitrary starting value h.
   226  func fnv(h uint32, s string) uint32 {
   227  	for i := 0; i < len(s); i++ {
   228  		h ^= uint32(s[i])
   229  		h *= 16777619
   230  	}
   231  	return h
   232  }
   233  
   234  // A table represents an attempt at constructing the lookup table.
   235  // The lookup table uses cuckoo hashing, meaning that each string
   236  // can be found in one of two positions.
   237  type table struct {
   238  	h0   uint32
   239  	k    uint
   240  	mask uint32
   241  	tab  []string
   242  }
   243  
   244  // hash returns the two hashes for s.
   245  func (t *table) hash(s string) (h1, h2 uint32) {
   246  	h := fnv(t.h0, s)
   247  	h1 = h & t.mask
   248  	h2 = (h >> 16) & t.mask
   249  	return
   250  }
   251  
   252  // init initializes the table with the given parameters.
   253  // h0 is the initial hash value,
   254  // k is the number of bits of hash value to use, and
   255  // x is the list of strings to store in the table.
   256  // init returns false if the table cannot be constructed.
   257  func (t *table) init(h0 uint32, k uint, x []string) bool {
   258  	t.h0 = h0
   259  	t.k = k
   260  	t.tab = make([]string, 1<<k)
   261  	t.mask = 1<<k - 1
   262  	for _, s := range x {
   263  		if !t.insert(s) {
   264  			return false
   265  		}
   266  	}
   267  	return true
   268  }
   269  
   270  // insert inserts s in the table.
   271  func (t *table) insert(s string) bool {
   272  	h1, h2 := t.hash(s)
   273  	if t.tab[h1] == "" {
   274  		t.tab[h1] = s
   275  		return true
   276  	}
   277  	if t.tab[h2] == "" {
   278  		t.tab[h2] = s
   279  		return true
   280  	}
   281  	if t.push(h1, 0) {
   282  		t.tab[h1] = s
   283  		return true
   284  	}
   285  	if t.push(h2, 0) {
   286  		t.tab[h2] = s
   287  		return true
   288  	}
   289  	return false
   290  }
   291  
   292  // push attempts to push aside the entry in slot i.
   293  func (t *table) push(i uint32, depth int) bool {
   294  	if depth > len(t.tab) {
   295  		return false
   296  	}
   297  	s := t.tab[i]
   298  	h1, h2 := t.hash(s)
   299  	j := h1 + h2 - i
   300  	if t.tab[j] != "" && !t.push(j, depth+1) {
   301  		return false
   302  	}
   303  	t.tab[j] = s
   304  	return true
   305  }
   306  
   307  // The lists of element names and attribute keys were taken from
   308  // https://html.spec.whatwg.org/multipage/indices.html#index
   309  // as of the "HTML Living Standard - Last Updated 16 April 2018" version.
   310  
   311  // "command", "keygen" and "menuitem" have been removed from the spec,
   312  // but are kept here for backwards compatibility.
   313  var elements = []string{
   314  	"a",
   315  	"abbr",
   316  	"address",
   317  	"area",
   318  	"article",
   319  	"aside",
   320  	"audio",
   321  	"b",
   322  	"base",
   323  	"bdi",
   324  	"bdo",
   325  	"blockquote",
   326  	"body",
   327  	"br",
   328  	"button",
   329  	"canvas",
   330  	"caption",
   331  	"cite",
   332  	"code",
   333  	"col",
   334  	"colgroup",
   335  	"command",
   336  	"data",
   337  	"datalist",
   338  	"dd",
   339  	"del",
   340  	"details",
   341  	"dfn",
   342  	"dialog",
   343  	"div",
   344  	"dl",
   345  	"dt",
   346  	"em",
   347  	"embed",
   348  	"fieldset",
   349  	"figcaption",
   350  	"figure",
   351  	"footer",
   352  	"form",
   353  	"h1",
   354  	"h2",
   355  	"h3",
   356  	"h4",
   357  	"h5",
   358  	"h6",
   359  	"head",
   360  	"header",
   361  	"hgroup",
   362  	"hr",
   363  	"html",
   364  	"i",
   365  	"iframe",
   366  	"img",
   367  	"input",
   368  	"ins",
   369  	"kbd",
   370  	"keygen",
   371  	"label",
   372  	"legend",
   373  	"li",
   374  	"link",
   375  	"main",
   376  	"map",
   377  	"mark",
   378  	"menu",
   379  	"menuitem",
   380  	"meta",
   381  	"meter",
   382  	"nav",
   383  	"noscript",
   384  	"object",
   385  	"ol",
   386  	"optgroup",
   387  	"option",
   388  	"output",
   389  	"p",
   390  	"param",
   391  	"picture",
   392  	"pre",
   393  	"progress",
   394  	"q",
   395  	"rp",
   396  	"rt",
   397  	"ruby",
   398  	"s",
   399  	"samp",
   400  	"script",
   401  	"section",
   402  	"select",
   403  	"slot",
   404  	"small",
   405  	"source",
   406  	"span",
   407  	"strong",
   408  	"style",
   409  	"sub",
   410  	"summary",
   411  	"sup",
   412  	"table",
   413  	"tbody",
   414  	"td",
   415  	"template",
   416  	"textarea",
   417  	"tfoot",
   418  	"th",
   419  	"thead",
   420  	"time",
   421  	"title",
   422  	"tr",
   423  	"track",
   424  	"u",
   425  	"ul",
   426  	"var",
   427  	"video",
   428  	"wbr",
   429  }
   430  
   431  // https://html.spec.whatwg.org/multipage/indices.html#attributes-3
   432  //
   433  // "challenge", "command", "contextmenu", "dropzone", "icon", "keytype", "mediagroup",
   434  // "radiogroup", "spellcheck", "scoped", "seamless", "sortable" and "sorted" have been removed from the spec,
   435  // but are kept here for backwards compatibility.
   436  var attributes = []string{
   437  	"abbr",
   438  	"accept",
   439  	"accept-charset",
   440  	"accesskey",
   441  	"action",
   442  	"allowfullscreen",
   443  	"allowpaymentrequest",
   444  	"allowusermedia",
   445  	"alt",
   446  	"as",
   447  	"async",
   448  	"autocomplete",
   449  	"autofocus",
   450  	"autoplay",
   451  	"challenge",
   452  	"charset",
   453  	"checked",
   454  	"cite",
   455  	"class",
   456  	"color",
   457  	"cols",
   458  	"colspan",
   459  	"command",
   460  	"content",
   461  	"contenteditable",
   462  	"contextmenu",
   463  	"controls",
   464  	"coords",
   465  	"crossorigin",
   466  	"data",
   467  	"datetime",
   468  	"default",
   469  	"defer",
   470  	"dir",
   471  	"dirname",
   472  	"disabled",
   473  	"download",
   474  	"draggable",
   475  	"dropzone",
   476  	"enctype",
   477  	"for",
   478  	"form",
   479  	"formaction",
   480  	"formenctype",
   481  	"formmethod",
   482  	"formnovalidate",
   483  	"formtarget",
   484  	"headers",
   485  	"height",
   486  	"hidden",
   487  	"high",
   488  	"href",
   489  	"hreflang",
   490  	"http-equiv",
   491  	"icon",
   492  	"id",
   493  	"inputmode",
   494  	"integrity",
   495  	"is",
   496  	"ismap",
   497  	"itemid",
   498  	"itemprop",
   499  	"itemref",
   500  	"itemscope",
   501  	"itemtype",
   502  	"keytype",
   503  	"kind",
   504  	"label",
   505  	"lang",
   506  	"list",
   507  	"loop",
   508  	"low",
   509  	"manifest",
   510  	"max",
   511  	"maxlength",
   512  	"media",
   513  	"mediagroup",
   514  	"method",
   515  	"min",
   516  	"minlength",
   517  	"multiple",
   518  	"muted",
   519  	"name",
   520  	"nomodule",
   521  	"nonce",
   522  	"novalidate",
   523  	"open",
   524  	"optimum",
   525  	"pattern",
   526  	"ping",
   527  	"placeholder",
   528  	"playsinline",
   529  	"poster",
   530  	"preload",
   531  	"radiogroup",
   532  	"readonly",
   533  	"referrerpolicy",
   534  	"rel",
   535  	"required",
   536  	"reversed",
   537  	"rows",
   538  	"rowspan",
   539  	"sandbox",
   540  	"spellcheck",
   541  	"scope",
   542  	"scoped",
   543  	"seamless",
   544  	"selected",
   545  	"shape",
   546  	"size",
   547  	"sizes",
   548  	"sortable",
   549  	"sorted",
   550  	"slot",
   551  	"span",
   552  	"spellcheck",
   553  	"src",
   554  	"srcdoc",
   555  	"srclang",
   556  	"srcset",
   557  	"start",
   558  	"step",
   559  	"style",
   560  	"tabindex",
   561  	"target",
   562  	"title",
   563  	"translate",
   564  	"type",
   565  	"typemustmatch",
   566  	"updateviacache",
   567  	"usemap",
   568  	"value",
   569  	"width",
   570  	"workertype",
   571  	"wrap",
   572  }
   573  
   574  // "onautocomplete", "onautocompleteerror", "onmousewheel",
   575  // "onshow" and "onsort" have been removed from the spec,
   576  // but are kept here for backwards compatibility.
   577  var eventHandlers = []string{
   578  	"onabort",
   579  	"onautocomplete",
   580  	"onautocompleteerror",
   581  	"onauxclick",
   582  	"onafterprint",
   583  	"onbeforeprint",
   584  	"onbeforeunload",
   585  	"onblur",
   586  	"oncancel",
   587  	"oncanplay",
   588  	"oncanplaythrough",
   589  	"onchange",
   590  	"onclick",
   591  	"onclose",
   592  	"oncontextmenu",
   593  	"oncopy",
   594  	"oncuechange",
   595  	"oncut",
   596  	"ondblclick",
   597  	"ondrag",
   598  	"ondragend",
   599  	"ondragenter",
   600  	"ondragexit",
   601  	"ondragleave",
   602  	"ondragover",
   603  	"ondragstart",
   604  	"ondrop",
   605  	"ondurationchange",
   606  	"onemptied",
   607  	"onended",
   608  	"onerror",
   609  	"onfocus",
   610  	"onhashchange",
   611  	"oninput",
   612  	"oninvalid",
   613  	"onkeydown",
   614  	"onkeypress",
   615  	"onkeyup",
   616  	"onlanguagechange",
   617  	"onload",
   618  	"onloadeddata",
   619  	"onloadedmetadata",
   620  	"onloadend",
   621  	"onloadstart",
   622  	"onmessage",
   623  	"onmessageerror",
   624  	"onmousedown",
   625  	"onmouseenter",
   626  	"onmouseleave",
   627  	"onmousemove",
   628  	"onmouseout",
   629  	"onmouseover",
   630  	"onmouseup",
   631  	"onmousewheel",
   632  	"onwheel",
   633  	"onoffline",
   634  	"ononline",
   635  	"onpagehide",
   636  	"onpageshow",
   637  	"onpaste",
   638  	"onpause",
   639  	"onplay",
   640  	"onplaying",
   641  	"onpopstate",
   642  	"onprogress",
   643  	"onratechange",
   644  	"onreset",
   645  	"onresize",
   646  	"onrejectionhandled",
   647  	"onscroll",
   648  	"onsecuritypolicyviolation",
   649  	"onseeked",
   650  	"onseeking",
   651  	"onselect",
   652  	"onshow",
   653  	"onsort",
   654  	"onstalled",
   655  	"onstorage",
   656  	"onsubmit",
   657  	"onsuspend",
   658  	"ontimeupdate",
   659  	"ontoggle",
   660  	"onunhandledrejection",
   661  	"onunload",
   662  	"onvolumechange",
   663  	"onwaiting",
   664  }
   665  
   666  // extra are ad-hoc values not covered by any of the lists above.
   667  var extra = []string{
   668  	"acronym",
   669  	"align",
   670  	"annotation",
   671  	"annotation-xml",
   672  	"applet",
   673  	"basefont",
   674  	"bgsound",
   675  	"big",
   676  	"blink",
   677  	"center",
   678  	"color",
   679  	"desc",
   680  	"face",
   681  	"font",
   682  	"foreignObject", // HTML is case-insensitive, but SVG-embedded-in-HTML is case-sensitive.
   683  	"foreignobject",
   684  	"frame",
   685  	"frameset",
   686  	"image",
   687  	"isindex",
   688  	"listing",
   689  	"malignmark",
   690  	"marquee",
   691  	"math",
   692  	"mglyph",
   693  	"mi",
   694  	"mn",
   695  	"mo",
   696  	"ms",
   697  	"mtext",
   698  	"nobr",
   699  	"noembed",
   700  	"noframes",
   701  	"plaintext",
   702  	"prompt",
   703  	"public",
   704  	"rb",
   705  	"rtc",
   706  	"spacer",
   707  	"strike",
   708  	"svg",
   709  	"system",
   710  	"tt",
   711  	"xmp",
   712  }