github.com/jancarloviray/community@v0.41.1-0.20170124221257-33a66c87cf2f/core/api/convert/html/html.go

github.com/jancarloviray/community@v0.41.1-0.20170124221257-33a66c87cf2f/core/api/convert/html/html.go (about)

     1  // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
     2  //
     3  // This software (Documize Community Edition) is licensed under
     4  // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
     5  //
     6  // You can operate outside the AGPL restrictions by purchasing
     7  // Documize Enterprise Edition and obtaining a commercial license
     8  // by contacting <sales@documize.com>.
     9  //
    10  // https://documize.com
    11  
    12  package html
    13  
import (
	"bytes"
	"fmt"
	"strings"
	"unicode/utf8"

	api "github.com/documize/community/core/convapi"
	"github.com/documize/community/core/log"
	"github.com/documize/community/core/utility"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"

	"golang.org/x/net/context"
)
    28  
    29  const maxTitle = 2000   // NOTE: must be the same length as database page.title
    30  const maxBody = 4000000 // NOTE: must be less than the mysql max_allowed_packet limit, amongst other values
    31  
    32  type htmlToSplit struct {
    33  	CFR       *api.DocumentConversionResponse
    34  	thisSect  api.Page
    35  	nodeCache map[*html.Node]bool
    36  }
    37  
    38  // Convert provides the standard interface for conversion of an HTML document.
    39  // All the function does is return a pointer to api.DocumentConversionResponse with
    40  // PagesHTML set to the given (*api.DocumentConversionRequest).Filedata - so effectively a no-op.
    41  func Convert(ctx context.Context, in interface{}) (interface{}, error) {
    42  	return &api.DocumentConversionResponse{
    43  		PagesHTML: in.(*api.DocumentConversionRequest).Filedata}, nil
    44  }
    45  
    46  // SplitIfHTML splits HTML code into pages, if it exists.
    47  func SplitIfHTML(req *api.DocumentConversionRequest, res *api.DocumentConversionResponse) error {
    48  	if len(res.PagesHTML) == 0 {
    49  		return nil
    50  	}
    51  	hd := &htmlToSplit{CFR: res, nodeCache: make(map[*html.Node]bool)}
    52  	err := hd.testableSplit(req, res)
    53  	/*
    54  		for k, v := range hd.CFR.Pages {
    55  			fmt.Printf("DEBUG hd.CFR.Pages[%d] = Level: %d Title: %s len(Body)=%d\n",
    56  				k, v.Level, v.Title, len(v.Body))
    57  		}
    58  	*/
    59  	return err
    60  }
    61  
    62  // testableSplit, NOTE pointer receiver so that test code can inspect generated datastructures.
    63  func (h *htmlToSplit) testableSplit(request *api.DocumentConversionRequest,
    64  	response *api.DocumentConversionResponse) error {
    65  	doc, err := html.Parse(bytes.NewReader(response.PagesHTML))
    66  	if err != nil {
    67  		return err
    68  	}
    69  	if doc.Type != html.DocumentNode {
    70  		return fmt.Errorf("no HTML document node")
    71  	}
    72  	for htm := doc.FirstChild; htm != nil; htm = htm.NextSibling {
    73  		if htm.Type == html.ElementNode && htm.DataAtom == atom.Html {
    74  			for bdy := htm.FirstChild; bdy != nil; bdy = bdy.NextSibling {
    75  				if bdy.Type == html.ElementNode && bdy.DataAtom == atom.Body {
    76  					h.thisSect = api.Page{
    77  						Level: 1,
    78  						Title: utility.BeautifyFilename(request.Filename),
    79  						Body:  []byte(``)}
    80  					err := h.processChildren(bdy)
    81  					if err != nil {
    82  						h.CFR.Err = err.Error()
    83  					}
    84  					h.CFR.Pages = append(h.CFR.Pages, h.thisSect)
    85  				}
    86  			}
    87  		}
    88  	}
    89  	return nil
    90  }
    91  
    92  func getLevel(at atom.Atom) uint64 {
    93  	level := uint64(1)
    94  	switch at {
    95  	case atom.H6:
    96  		level++
    97  		fallthrough
    98  	case atom.H5:
    99  		level++
   100  		fallthrough
   101  	case atom.H4:
   102  		level++
   103  		fallthrough
   104  	case atom.H3:
   105  		level++
   106  		fallthrough
   107  	case atom.H2:
   108  		level++
   109  		fallthrough
   110  	case atom.H1:
   111  		level++
   112  	}
   113  	return level
   114  }
   115  
   116  func (h *htmlToSplit) processChildren(bdy *html.Node) error {
   117  	for c := bdy.FirstChild; c != nil; c = c.NextSibling {
   118  		var err error
   119  		if c.Type == html.ElementNode {
   120  			if level := getLevel(c.DataAtom); level > 1 {
   121  				err = h.renderHeading(c, level)
   122  			} else {
   123  				err = h.renderNonHeading(c)
   124  			}
   125  		} else {
   126  			err = h.renderAppend(c)
   127  		}
   128  		if err != nil {
   129  			return err
   130  		}
   131  	}
   132  	return nil
   133  }
   134  
   135  func stripZeroWidthSpaces(str string) string {
   136  	ret := ""
   137  	for _, r := range str {
   138  		if r != 0x200B { // zero width space
   139  			ret += string(r) // stripped of zero-width spaces
   140  		}
   141  	}
   142  	return ret
   143  }
   144  
   145  func (h *htmlToSplit) renderHeading(c *html.Node, level uint64) error {
   146  	byt, err := byteRenderChildren(c) // get heading html
   147  	if err != nil {
   148  		return err
   149  	}
   150  	str, err := utility.HTML(string(byt)).Text(false) // heading text
   151  	if err != nil {
   152  		return err
   153  	}
   154  	str = stripZeroWidthSpaces(str)
   155  	if strings.TrimSpace(str) != "" { // only put in non-empty headings
   156  		h.newSect(str, level)
   157  	}
   158  	return nil
   159  }
   160  
   161  func (h *htmlToSplit) newSect(tstr string, level uint64) {
   162  	h.CFR.Pages = append(h.CFR.Pages, h.thisSect)
   163  	title := tstr //was: utility.EscapeHTMLcomplexChars(tstr) -- removed to avoid double-escaping
   164  	body := ``
   165  	if len(title) > maxTitle {
   166  		body = title[maxTitle:]
   167  		title = title[:maxTitle]
   168  	}
   169  	h.thisSect = api.Page{
   170  		Level: level,
   171  		Title: title,
   172  		Body:  []byte(body)}
   173  }
   174  
   175  func (h *htmlToSplit) renderNonHeading(c *html.Node) error {
   176  	if h.nodeContainsHeading(c) { // ignore this atom in order to get at the contents
   177  		err := h.processChildren(c)
   178  		if err != nil {
   179  			return err
   180  		}
   181  	} else {
   182  		if err := h.renderAppend(c); err != nil {
   183  			return err
   184  		}
   185  	}
   186  	return nil
   187  }
   188  
   189  func (h *htmlToSplit) renderAppend(c *html.Node) error {
   190  	byt, err := byteRender(c)
   191  	if err != nil {
   192  		return err
   193  	}
   194  	ebyt := utility.EscapeHTMLcomplexCharsByte(byt)
   195  	if len(ebyt) > maxBody {
   196  		msg := fmt.Sprintf("(Documize warning: HTML render element ignored, size of %d exceeded maxBody of %d.)", len(ebyt), maxBody)
   197  		log.Info(msg)
   198  		ebyt = []byte("<p><b>" + msg + "</b></p>")
   199  	}
   200  	if len(h.thisSect.Body)+len(ebyt) > maxBody {
   201  		h.newSect("-", h.thisSect.Level+1) // plus one so that the new "-" one is part of the previous
   202  	}
   203  	h.thisSect.Body = append(h.thisSect.Body, ebyt...)
   204  	return nil
   205  }
   206  
   207  func byteRender(n *html.Node) ([]byte, error) {
   208  	var b bytes.Buffer
   209  	err := html.Render(&b, n)
   210  	return b.Bytes(), err
   211  }
   212  
   213  func byteRenderChildren(n *html.Node) ([]byte, error) {
   214  	var b bytes.Buffer
   215  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   216  		err := html.Render(&b, c)
   217  		if err != nil {
   218  			return nil, err
   219  		}
   220  	}
   221  	return b.Bytes(), nil
   222  }
   223  
   224  func (h *htmlToSplit) nodeContainsHeading(n *html.Node) bool {
   225  	val, ok := h.nodeCache[n]
   226  	if ok {
   227  		return val
   228  	}
   229  	switch n.DataAtom {
   230  	case atom.H6, atom.H5, atom.H4, atom.H3, atom.H2, atom.H1:
   231  		h.nodeCache[n] = true
   232  		return true
   233  	default:
   234  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   235  			if h.nodeContainsHeading(c) {
   236  				h.nodeCache[n] = true
   237  				h.nodeCache[c] = true
   238  				return true
   239  			}
   240  		}
   241  	}
   242  	h.nodeCache[n] = false
   243  	return false
   244  }