github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/wpjsontool/render.go (about) 1 package main 2 3 import ( 4 "golang.org/x/net/html" 5 "golang.org/x/net/html/atom" 6 "strings" 7 ) 8 9 var inlineNodes = map[atom.Atom]struct{}{ 10 atom.A: {}, 11 atom.Em: {}, 12 atom.Strong: {}, 13 atom.Small: {}, 14 atom.S: {}, 15 atom.Cite: {}, 16 atom.Q: {}, 17 atom.Dfn: {}, 18 atom.Abbr: {}, 19 // atom.Data 20 atom.Time: {}, 21 atom.Code: {}, 22 atom.Var: {}, 23 atom.Samp: {}, 24 atom.Kbd: {}, 25 atom.Sub: {}, 26 atom.Sup: {}, 27 atom.I: {}, 28 atom.B: {}, 29 atom.U: {}, 30 atom.Mark: {}, 31 atom.Ruby: {}, 32 atom.Rt: {}, 33 atom.Rp: {}, 34 atom.Bdi: {}, 35 atom.Bdo: {}, 36 atom.Span: {}, 37 // atom.Br: {}, 38 atom.Wbr: {}, 39 atom.Ins: {}, 40 atom.Del: {}, 41 } 42 43 // NodeToText renders HTML as text, using linebreaks for block elements 44 func NodeToText(n *html.Node) string { 45 if n.Type == html.TextNode { 46 return n.Data 47 } 48 49 inline := false 50 51 if n.Type == html.ElementNode { 52 _, inline = inlineNodes[n.DataAtom] 53 // special case for some structural elements 54 if n.DataAtom == atom.Html || n.DataAtom == atom.Head || n.DataAtom == atom.Body { 55 inline = true 56 } 57 } 58 59 txt := "" 60 for child := n.FirstChild; child != nil; child = child.NextSibling { 61 txt += NodeToText(child) 62 } 63 64 if !inline { 65 txt += "\n" 66 } 67 68 return txt 69 } 70 71 // HTMLToText converts html into text, with an attempt to make it 72 // look nice by plonking line breaks between block elements. 73 func HTMLToText(h string) (string, error) { 74 bod := &html.Node{ 75 Type: html.ElementNode, 76 Data: "body", 77 DataAtom: atom.Body, 78 } 79 80 nodes, err := html.ParseFragment(strings.NewReader(h), bod) 81 if err != nil { 82 return "", err 83 } 84 85 var txt string 86 for _, n := range nodes { 87 txt = txt + NodeToText(n) 88 } 89 90 return txt, nil 91 }