github.com/jancarloviray/community@v0.41.1-0.20170124221257-33a66c87cf2f/core/api/convert/html/html.go (about) 1 // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved. 2 // 3 // This software (Documize Community Edition) is licensed under 4 // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html 5 // 6 // You can operate outside the AGPL restrictions by purchasing 7 // Documize Enterprise Edition and obtaining a commercial license 8 // by contacting <sales@documize.com>. 9 // 10 // https://documize.com 11 12 package html 13 14 import ( 15 "bytes" 16 "fmt" 17 "strings" 18 19 api "github.com/documize/community/core/convapi" 20 "github.com/documize/community/core/log" 21 "github.com/documize/community/core/utility" 22 23 "golang.org/x/net/html" 24 "golang.org/x/net/html/atom" 25 26 "golang.org/x/net/context" 27 ) 28 29 const maxTitle = 2000 // NOTE: must be the same length as database page.title 30 const maxBody = 4000000 // NOTE: must be less than the mysql max_allowed_packet limit, amongst other values 31 32 type htmlToSplit struct { 33 CFR *api.DocumentConversionResponse 34 thisSect api.Page 35 nodeCache map[*html.Node]bool 36 } 37 38 // Convert provides the standard interface for conversion of an HTML document. 39 // All the function does is return a pointer to api.DocumentConversionResponse with 40 // PagesHTML set to the given (*api.DocumentConversionRequest).Filedata - so effectively a no-op. 41 func Convert(ctx context.Context, in interface{}) (interface{}, error) { 42 return &api.DocumentConversionResponse{ 43 PagesHTML: in.(*api.DocumentConversionRequest).Filedata}, nil 44 } 45 46 // SplitIfHTML splits HTML code into pages, if it exists. 47 func SplitIfHTML(req *api.DocumentConversionRequest, res *api.DocumentConversionResponse) error { 48 if len(res.PagesHTML) == 0 { 49 return nil 50 } 51 hd := &htmlToSplit{CFR: res, nodeCache: make(map[*html.Node]bool)} 52 err := hd.testableSplit(req, res) 53 /* 54 for k, v := range hd.CFR.Pages { 55 fmt.Printf("DEBUG hd.CFR.Pages[%d] = Level: %d Title: %s len(Body)=%d\n", 56 k, v.Level, v.Title, len(v.Body)) 57 } 58 */ 59 return err 60 } 61 62 // testableSplit, NOTE pointer receiver so that test code can inspect generated datastructures. 63 func (h *htmlToSplit) testableSplit(request *api.DocumentConversionRequest, 64 response *api.DocumentConversionResponse) error { 65 doc, err := html.Parse(bytes.NewReader(response.PagesHTML)) 66 if err != nil { 67 return err 68 } 69 if doc.Type != html.DocumentNode { 70 return fmt.Errorf("no HTML document node") 71 } 72 for htm := doc.FirstChild; htm != nil; htm = htm.NextSibling { 73 if htm.Type == html.ElementNode && htm.DataAtom == atom.Html { 74 for bdy := htm.FirstChild; bdy != nil; bdy = bdy.NextSibling { 75 if bdy.Type == html.ElementNode && bdy.DataAtom == atom.Body { 76 h.thisSect = api.Page{ 77 Level: 1, 78 Title: utility.BeautifyFilename(request.Filename), 79 Body: []byte(``)} 80 err := h.processChildren(bdy) 81 if err != nil { 82 h.CFR.Err = err.Error() 83 } 84 h.CFR.Pages = append(h.CFR.Pages, h.thisSect) 85 } 86 } 87 } 88 } 89 return nil 90 } 91 92 func getLevel(at atom.Atom) uint64 { 93 level := uint64(1) 94 switch at { 95 case atom.H6: 96 level++ 97 fallthrough 98 case atom.H5: 99 level++ 100 fallthrough 101 case atom.H4: 102 level++ 103 fallthrough 104 case atom.H3: 105 level++ 106 fallthrough 107 case atom.H2: 108 level++ 109 fallthrough 110 case atom.H1: 111 level++ 112 } 113 return level 114 } 115 116 func (h *htmlToSplit) processChildren(bdy *html.Node) error { 117 for c := bdy.FirstChild; c != nil; c = c.NextSibling { 118 var err error 119 if c.Type == html.ElementNode { 120 if level := getLevel(c.DataAtom); level > 1 { 121 err = h.renderHeading(c, level) 122 } else { 123 err = h.renderNonHeading(c) 124 } 125 } else { 126 err = h.renderAppend(c) 127 } 128 if err != nil { 129 return err 130 } 131 } 132 return nil 133 } 134 135 func stripZeroWidthSpaces(str string) string { 136 ret := "" 137 for _, r := range str { 138 if r != 0x200B { // zero width space 139 ret += string(r) // stripped of zero-width spaces 140 } 141 } 142 return ret 143 } 144 145 func (h *htmlToSplit) renderHeading(c *html.Node, level uint64) error { 146 byt, err := byteRenderChildren(c) // get heading html 147 if err != nil { 148 return err 149 } 150 str, err := utility.HTML(string(byt)).Text(false) // heading text 151 if err != nil { 152 return err 153 } 154 str = stripZeroWidthSpaces(str) 155 if strings.TrimSpace(str) != "" { // only put in non-empty headings 156 h.newSect(str, level) 157 } 158 return nil 159 } 160 161 func (h *htmlToSplit) newSect(tstr string, level uint64) { 162 h.CFR.Pages = append(h.CFR.Pages, h.thisSect) 163 title := tstr //was: utility.EscapeHTMLcomplexChars(tstr) -- removed to avoid double-escaping 164 body := `` 165 if len(title) > maxTitle { 166 body = title[maxTitle:] 167 title = title[:maxTitle] 168 } 169 h.thisSect = api.Page{ 170 Level: level, 171 Title: title, 172 Body: []byte(body)} 173 } 174 175 func (h *htmlToSplit) renderNonHeading(c *html.Node) error { 176 if h.nodeContainsHeading(c) { // ignore this atom in order to get at the contents 177 err := h.processChildren(c) 178 if err != nil { 179 return err 180 } 181 } else { 182 if err := h.renderAppend(c); err != nil { 183 return err 184 } 185 } 186 return nil 187 } 188 189 func (h *htmlToSplit) renderAppend(c *html.Node) error { 190 byt, err := byteRender(c) 191 if err != nil { 192 return err 193 } 194 ebyt := utility.EscapeHTMLcomplexCharsByte(byt) 195 if len(ebyt) > maxBody { 196 msg := fmt.Sprintf("(Documize warning: HTML render element ignored, size of %d exceeded maxBody of %d.)", len(ebyt), maxBody) 197 log.Info(msg) 198 ebyt = []byte("<p><b>" + msg + "</b></p>") 199 } 200 if len(h.thisSect.Body)+len(ebyt) > maxBody { 201 h.newSect("-", h.thisSect.Level+1) // plus one so that the new "-" one is part of the previous 202 } 203 h.thisSect.Body = append(h.thisSect.Body, ebyt...) 204 return nil 205 } 206 207 func byteRender(n *html.Node) ([]byte, error) { 208 var b bytes.Buffer 209 err := html.Render(&b, n) 210 return b.Bytes(), err 211 } 212 213 func byteRenderChildren(n *html.Node) ([]byte, error) { 214 var b bytes.Buffer 215 for c := n.FirstChild; c != nil; c = c.NextSibling { 216 err := html.Render(&b, c) 217 if err != nil { 218 return nil, err 219 } 220 } 221 return b.Bytes(), nil 222 } 223 224 func (h *htmlToSplit) nodeContainsHeading(n *html.Node) bool { 225 val, ok := h.nodeCache[n] 226 if ok { 227 return val 228 } 229 switch n.DataAtom { 230 case atom.H6, atom.H5, atom.H4, atom.H3, atom.H2, atom.H1: 231 h.nodeCache[n] = true 232 return true 233 default: 234 for c := n.FirstChild; c != nil; c = c.NextSibling { 235 if h.nodeContainsHeading(c) { 236 h.nodeCache[n] = true 237 h.nodeCache[c] = true 238 return true 239 } 240 } 241 } 242 h.nodeCache[n] = false 243 return false 244 }