github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/wpjsontool/render.go (about)

     1  package main
     2  
     3  import (
     4  	"golang.org/x/net/html"
     5  	"golang.org/x/net/html/atom"
     6  	"strings"
     7  )
     8  
     9  var inlineNodes = map[atom.Atom]struct{}{
    10  	atom.A:      {},
    11  	atom.Em:     {},
    12  	atom.Strong: {},
    13  	atom.Small:  {},
    14  	atom.S:      {},
    15  	atom.Cite:   {},
    16  	atom.Q:      {},
    17  	atom.Dfn:    {},
    18  	atom.Abbr:   {},
    19  	// atom.Data
    20  	atom.Time: {},
    21  	atom.Code: {},
    22  	atom.Var:  {},
    23  	atom.Samp: {},
    24  	atom.Kbd:  {},
    25  	atom.Sub:  {},
    26  	atom.Sup:  {},
    27  	atom.I:    {},
    28  	atom.B:    {},
    29  	atom.U:    {},
    30  	atom.Mark: {},
    31  	atom.Ruby: {},
    32  	atom.Rt:   {},
    33  	atom.Rp:   {},
    34  	atom.Bdi:  {},
    35  	atom.Bdo:  {},
    36  	atom.Span: {},
    37  	//	atom.Br:   {},
    38  	atom.Wbr: {},
    39  	atom.Ins: {},
    40  	atom.Del: {},
    41  }
    42  
    43  // NodeToText renders HTML as text, using linebreaks for block elements
    44  func NodeToText(n *html.Node) string {
    45  	if n.Type == html.TextNode {
    46  		return n.Data
    47  	}
    48  
    49  	inline := false
    50  
    51  	if n.Type == html.ElementNode {
    52  		_, inline = inlineNodes[n.DataAtom]
    53  		// special case for some structural elements
    54  		if n.DataAtom == atom.Html || n.DataAtom == atom.Head || n.DataAtom == atom.Body {
    55  			inline = true
    56  		}
    57  	}
    58  
    59  	txt := ""
    60  	for child := n.FirstChild; child != nil; child = child.NextSibling {
    61  		txt += NodeToText(child)
    62  	}
    63  
    64  	if !inline {
    65  		txt += "\n"
    66  	}
    67  
    68  	return txt
    69  }
    70  
    71  // HTMLToText converts html into text, with an attempt to make it
    72  // look nice by plonking line breaks between block elements.
    73  func HTMLToText(h string) (string, error) {
    74  	bod := &html.Node{
    75  		Type:     html.ElementNode,
    76  		Data:     "body",
    77  		DataAtom: atom.Body,
    78  	}
    79  
    80  	nodes, err := html.ParseFragment(strings.NewReader(h), bod)
    81  	if err != nil {
    82  		return "", err
    83  	}
    84  
    85  	var txt string
    86  	for _, n := range nodes {
    87  		txt = txt + NodeToText(n)
    88  	}
    89  
    90  	return txt, nil
    91  }