github.com/elliott5/community@v0.14.1-0.20160709191136-823126fb026a/wordsmith/utility/html.go (about) 1 // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved. 2 // 3 // This software (Documize Community Edition) is licensed under 4 // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html 5 // 6 // You can operate outside the AGPL restrictions by purchasing 7 // Documize Enterprise Edition and obtaining a commercial license 8 // by contacting <sales@documize.com>. 9 // 10 // https://documize.com 11 12 package utility 13 14 import ( 15 "bytes" 16 "fmt" 17 "io" 18 "strings" 19 "unicode/utf8" 20 21 "golang.org/x/net/html" 22 "golang.org/x/net/html/atom" 23 24 "github.com/documize/community/wordsmith/log" 25 ) 26 27 // HTML describes a chunk of HTML, Text() method returns plain text. 28 type HTML string 29 30 // write out the textual element of the html node, if present, then iterate through the child nodes. 31 func writeText(n *html.Node, b io.Writer, isTest bool) { 32 if !excluded(n) { 33 switch n.Type { 34 case html.TextNode: 35 _, err := b.Write([]byte(n.Data + string(rune(0x200B)))) // + http://en.wikipedia.org/wiki/Zero-width_space 36 if err != nil { 37 log.Error("write TextNode", err) 38 } 39 // TODO This use of zero-width-space (subsequently replaced by ' ' or ignored, depending on context) 40 // TODO works well for in-word breaks, but at the expense of concatenating some words in error. 41 // TODO It may be that better examination of the HTML structure could be used to determine 42 // TODO when a space is, or is not, required. In that event we would not use zero-width-space. 43 44 default: 45 for c := n.FirstChild; c != nil; c = c.NextSibling { 46 writeText(c, b, isTest) 47 } 48 switch n.DataAtom { 49 case 0: 50 if n.Data == "documize" { 51 for _, a := range n.Attr { 52 if a.Key == "type" { 53 if isTest { 54 var err error 55 switch a.Val { 56 case "field-start": 57 _, err = b.Write([]byte(" [ ")) 58 case "field-end": 59 _, err = b.Write([]byte(" ] ")) 60 default: 61 _, err = b.Write([]byte(" [ ] ")) 62 } 63 if err != nil { 64 log.Error("write []", err) 65 } 66 } 67 return 68 } 69 } 70 } 71 case atom.Span, atom.U, atom.B, atom.I, atom.Del, atom.Sub, atom.Sup: 72 //NoOp 73 default: 74 _, err := b.Write([]byte(" ")) // add a space after each main element 75 if err != nil { 76 log.Error("write space", err) 77 } 78 } 79 } 80 } 81 } 82 83 func excluded(n *html.Node) bool { 84 if n.DataAtom == atom.Div { 85 for _, a := range n.Attr { 86 if a.Key == "class" { 87 switch a.Val { 88 case "documize-first-page", 89 "documize-exotic-image", 90 "documize-footnote", 91 "documize-graphictext", 92 "documize-math": 93 return true 94 } 95 } 96 } 97 } 98 return false 99 } 100 101 // findBody finds the body HTML node if it exists in the tree. Required to bypass the page title text. 102 func findBody(n *html.Node) *html.Node { 103 if n.DataAtom == atom.Body { 104 return n 105 } 106 for c := n.FirstChild; c != nil; c = c.NextSibling { 107 r := findBody(c) 108 if r != nil { 109 return r 110 } 111 } 112 return nil 113 } 114 115 // Text returns only the plain text elements of the HTML Chunk, concatanated with "\n", 116 // for use in the TOC or for text indexing. 117 func (ch HTML) Text(isTest bool) (string, error) { 118 var b bytes.Buffer 119 doc, err := html.Parse(strings.NewReader(string(ch))) 120 if err != nil { 121 return "", err 122 } 123 body := findBody(doc) 124 if body == nil { 125 body = doc 126 } 127 writeText(body, &b, isTest) 128 return string(b.Bytes()), nil 129 } 130 131 // EscapeHTMLcomplexChars looks for "complex" characters within HTML 132 // and replaces them with the HTML escape codes which describe them. 133 // "Complex" characters are those encoded in more than one byte by UTF8. 134 func EscapeHTMLcomplexChars(s string) string { 135 ret := "" 136 for _, r := range s { 137 if utf8.RuneLen(r) > 1 { 138 ret += fmt.Sprintf("&#%d;", r) 139 } else { 140 ret += string(r) 141 } 142 } 143 return ret 144 } 145 146 // EscapeHTMLcomplexCharsByte looks for "complex" characters within HTML 147 // and replaces them with the HTML escape codes which describe them. 148 // "Complex" characters are those encoded in more than one byte by UTF8. 149 func EscapeHTMLcomplexCharsByte(b []byte) []byte { 150 var ret bytes.Buffer 151 for len(b) > 0 { 152 r, size := utf8.DecodeRune(b) 153 if utf8.RuneLen(r) > 1 { 154 fmt.Fprintf(&ret, "&#%d;", r) 155 } else { 156 _, err := ret.Write(b[:size]) 157 if err != nil { 158 log.Error("EscapeHTMLcomplexCharsByte", err) 159 } 160 } 161 b = b[size:] 162 } 163 return ret.Bytes() 164 }