github.com/vugu/vugu@v0.3.6-0.20240430171613-3f6f402e014b/internal/htmlx/doctype.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package htmlx
     6  
     7  import (
     8  	"strings"
     9  )
    10  
    11  // parseDoctype parses the data from a DoctypeToken into a name,
    12  // public identifier, and system identifier. It returns a Node whose Type
    13  // is DoctypeNode, whose Data is the name, and which has attributes
    14  // named "system" and "public" for the two identifiers if they were present.
    15  // quirks is whether the document should be parsed in "quirks mode".
    16  func parseDoctype(s string) (n *Node, quirks bool) {
    17  	n = &Node{Type: DoctypeNode}
    18  
    19  	// Find the name.
    20  	space := strings.IndexAny(s, whitespace)
    21  	if space == -1 {
    22  		space = len(s)
    23  	}
    24  	n.Data = s[:space]
    25  	// The comparison to "html" is case-sensitive.
    26  	if n.Data != "html" {
    27  		quirks = true
    28  	}
    29  	n.Data = strings.ToLower(n.Data)
    30  	s = strings.TrimLeft(s[space:], whitespace)
    31  
    32  	if len(s) < 6 {
    33  		// It can't start with "PUBLIC" or "SYSTEM".
    34  		// Ignore the rest of the string.
    35  		return n, quirks || s != ""
    36  	}
    37  
    38  	key := strings.ToLower(s[:6])
    39  	s = s[6:]
    40  	for key == "public" || key == "system" {
    41  		s = strings.TrimLeft(s, whitespace)
    42  		if s == "" {
    43  			break
    44  		}
    45  		quote := s[0]
    46  		if quote != '"' && quote != '\'' {
    47  			break
    48  		}
    49  		s = s[1:]
    50  		q := strings.IndexRune(s, rune(quote))
    51  		var id string
    52  		if q == -1 {
    53  			id = s
    54  			s = ""
    55  		} else {
    56  			id = s[:q]
    57  			s = s[q+1:]
    58  		}
    59  		n.Attr = append(n.Attr, Attribute{Key: key, Val: id})
    60  		if key == "public" {
    61  			key = "system"
    62  		} else {
    63  			key = ""
    64  		}
    65  	}
    66  
    67  	if key != "" || s != "" {
    68  		quirks = true
    69  	} else if len(n.Attr) > 0 {
    70  		if n.Attr[0].Key == "public" {
    71  			public := strings.ToLower(n.Attr[0].Val)
    72  			switch public {
    73  			case "-//w3o//dtd w3 html strict 3.0//en//", "-/w3d/dtd html 4.0 transitional/en", "html":
    74  				quirks = true
    75  			default:
    76  				for _, q := range quirkyIDs {
    77  					if strings.HasPrefix(public, q) {
    78  						quirks = true
    79  						break
    80  					}
    81  				}
    82  			}
    83  			// The following two public IDs only cause quirks mode if there is no system ID.
    84  			if len(n.Attr) == 1 && (strings.HasPrefix(public, "-//w3c//dtd html 4.01 frameset//") ||
    85  				strings.HasPrefix(public, "-//w3c//dtd html 4.01 transitional//")) {
    86  				quirks = true
    87  			}
    88  		}
    89  		if lastAttr := n.Attr[len(n.Attr)-1]; lastAttr.Key == "system" &&
    90  			strings.ToLower(lastAttr.Val) == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" {
    91  			quirks = true
    92  		}
    93  	}
    94  
    95  	return n, quirks
    96  }
    97  
// quirkyIDs is a list of public doctype identifiers that cause a document
// to be interpreted in quirks mode. The identifiers should be in lower case.
//
// parseDoctype lower-cases the document's public identifier and tests each
// entry with strings.HasPrefix, so every entry is a prefix, not a complete
// identifier, and matching is effectively case-insensitive.
var quirkyIDs = []string{
	"+//silmaril//dtd html pro v0r11 19970101//",
	"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
	"-//as//dtd html 3.0 aswedit + extensions//",
	"-//ietf//dtd html 2.0 level 1//",
	"-//ietf//dtd html 2.0 level 2//",
	"-//ietf//dtd html 2.0 strict level 1//",
	"-//ietf//dtd html 2.0 strict level 2//",
	"-//ietf//dtd html 2.0 strict//",
	"-//ietf//dtd html 2.0//",
	"-//ietf//dtd html 2.1e//",
	"-//ietf//dtd html 3.0//",
	"-//ietf//dtd html 3.2 final//",
	"-//ietf//dtd html 3.2//",
	"-//ietf//dtd html 3//",
	"-//ietf//dtd html level 0//",
	"-//ietf//dtd html level 1//",
	"-//ietf//dtd html level 2//",
	"-//ietf//dtd html level 3//",
	"-//ietf//dtd html strict level 0//",
	"-//ietf//dtd html strict level 1//",
	"-//ietf//dtd html strict level 2//",
	"-//ietf//dtd html strict level 3//",
	"-//ietf//dtd html strict//",
	"-//ietf//dtd html//",
	"-//metrius//dtd metrius presentational//",
	"-//microsoft//dtd internet explorer 2.0 html strict//",
	"-//microsoft//dtd internet explorer 2.0 html//",
	"-//microsoft//dtd internet explorer 2.0 tables//",
	"-//microsoft//dtd internet explorer 3.0 html strict//",
	"-//microsoft//dtd internet explorer 3.0 html//",
	"-//microsoft//dtd internet explorer 3.0 tables//",
	"-//netscape comm. corp.//dtd html//",
	"-//netscape comm. corp.//dtd strict html//",
	"-//o'reilly and associates//dtd html 2.0//",
	"-//o'reilly and associates//dtd html extended 1.0//",
	"-//o'reilly and associates//dtd html extended relaxed 1.0//",
	"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
	"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
	"-//spyglass//dtd html 2.0 extended//",
	"-//sq//dtd html 2.0 hotmetal + extensions//",
	"-//sun microsystems corp.//dtd hotjava html//",
	"-//sun microsystems corp.//dtd hotjava strict html//",
	"-//w3c//dtd html 3 1995-03-24//",
	"-//w3c//dtd html 3.2 draft//",
	"-//w3c//dtd html 3.2 final//",
	"-//w3c//dtd html 3.2//",
	"-//w3c//dtd html 3.2s draft//",
	"-//w3c//dtd html 4.0 frameset//",
	"-//w3c//dtd html 4.0 transitional//",
	"-//w3c//dtd html experimental 19960712//",
	"-//w3c//dtd html experimental 970421//",
	"-//w3c//dtd w3 html//",
	"-//w3o//dtd w3 html 3.0//",
	"-//webtechs//dtd mozilla html 2.0//",
	"-//webtechs//dtd mozilla html//",
}