github.com/Andyfoo/golang/x/net@v0.0.0-20190901054642-57c1bf301704/html/doc.go

github.com/Andyfoo/golang/x/net@v0.0.0-20190901054642-57c1bf301704/html/doc.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  /*
     6  Package html implements an HTML5-compliant tokenizer and parser.
     7  
     8  Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
     9  caller's responsibility to ensure that r provides UTF-8 encoded HTML.
    10  
    11  	z := html.NewTokenizer(r)
    12  
    13  Given a Tokenizer z, the HTML is tokenized by repeatedly calling z.Next(),
    14  which parses the next token and returns its type, or ErrorToken on error:
    15  
    16  	for {
    17  		tt := z.Next()
    18  		if tt == html.ErrorToken {
    19  			// ...
    20  			return ...
    21  		}
    22  		// Process the current token.
    23  	}
    24  
    25  There are two APIs for retrieving the current token. The high-level API is to
    26  call Token; the low-level API is to call Text or TagName / TagAttr. Both APIs
    27  allow optionally calling Raw after Next but before Token, Text, TagName, or
    28  TagAttr. In EBNF notation, the valid call sequence per token is:
    29  
    30  	Next {Raw} [ Token | Text | TagName {TagAttr} ]
    31  
    32  Token returns an independent data structure that completely describes a token.
    33  Entities (such as "&lt;") are unescaped, tag names and attribute keys are
    34  lower-cased, and attributes are collected into a []Attribute. For example:
    35  
    36  	for {
    37  		if z.Next() == html.ErrorToken {
    38  			// Returning io.EOF indicates success.
    39  			return z.Err()
    40  		}
    41  		emitToken(z.Token())
    42  	}
    43  
    44  The low-level API performs fewer allocations and copies, but the contents of
    45  the []byte values returned by Text, TagName and TagAttr may change on the next
    46  call to Next. For example, to extract an HTML page's anchor text:
    47  
    48  	depth := 0
    49  	for {
    50  		tt := z.Next()
    51  		switch tt {
    52  		case html.ErrorToken:
    53  			return z.Err()
    54  		case html.TextToken:
    55  			if depth > 0 {
    56  				// emitBytes should copy the []byte it receives,
    57  				// if it doesn't process it immediately.
    58  				emitBytes(z.Text())
    59  			}
    60  		case html.StartTagToken, html.EndTagToken:
    61  			tn, _ := z.TagName()
    62  			if len(tn) == 1 && tn[0] == 'a' {
    63  				if tt == html.StartTagToken {
    64  					depth++
    65  				} else {
    66  					depth--
    67  				}
    68  			}
    69  		}
    70  	}
    71  
    72  Parsing is done by calling Parse with an io.Reader, which returns the root of
    73  the parse tree (the document element) as a *Node, along with any error. It is
    74  the caller's responsibility to ensure that the Reader provides UTF-8 encoded
    75  HTML. For example, to process each anchor node in depth-first order:
    76  
    77  	doc, err := html.Parse(r)
    78  	if err != nil {
    79  		// ...
    80  	}
    81  	var f func(*html.Node)
    82  	f = func(n *html.Node) {
    83  		if n.Type == html.ElementNode && n.Data == "a" {
    84  			// Do something with n...
    85  		}
    86  		for c := n.FirstChild; c != nil; c = c.NextSibling {
    87  			f(c)
    88  		}
    89  	}
    90  	f(doc)
    91  
    92  The relevant specifications include:
    93  https://html.spec.whatwg.org/multipage/syntax.html and
    94  https://html.spec.whatwg.org/multipage/syntax.html#tokenization
    95  */
    96  package html // import "github.com/Andyfoo/golang/x/net/html"
    97  
    98  // The tokenization algorithm implemented by this package is not a line-by-line
    99  // transliteration of the relatively verbose state machine in the WHATWG
   100  // specification. A more direct approach is used instead, where the program
   101  // counter implies the state, such as whether it is tokenizing a tag or a text
   102  // node. Specification compliance is verified by checking expected and actual
   103  // outputs over a test suite rather than aiming for algorithmic fidelity.
   104  
   105  // TODO(nigeltao): Does a DOM API belong in this package or a separate one?
   106  // TODO(nigeltao): How does parsing interact with a JavaScript engine?