github.com/elliott5/community@v0.14.1-0.20160709191136-823126fb026a/wordsmith/utility/html.go

github.com/elliott5/community@v0.14.1-0.20160709191136-823126fb026a/wordsmith/utility/html.go (about)

     1  // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
     2  //
     3  // This software (Documize Community Edition) is licensed under
     4  // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
     5  //
     6  // You can operate outside the AGPL restrictions by purchasing
     7  // Documize Enterprise Edition and obtaining a commercial license
     8  // by contacting <sales@documize.com>.
     9  //
    10  // https://documize.com
    11  
    12  package utility
    13  
    14  import (
    15  	"bytes"
    16  	"fmt"
    17  	"io"
    18  	"strings"
    19  	"unicode/utf8"
    20  
    21  	"golang.org/x/net/html"
    22  	"golang.org/x/net/html/atom"
    23  
    24  	"github.com/documize/community/wordsmith/log"
    25  )
    26  
// HTML holds a chunk of HTML markup as a string.
// Its Text method returns the plain-text content of that markup.
type HTML string
    29  
    30  // write out the textual element of the html node, if present, then iterate through the child nodes.
    31  func writeText(n *html.Node, b io.Writer, isTest bool) {
    32  	if !excluded(n) {
    33  		switch n.Type {
    34  		case html.TextNode:
    35  			_, err := b.Write([]byte(n.Data + string(rune(0x200B)))) // + http://en.wikipedia.org/wiki/Zero-width_space
    36  			if err != nil {
    37  				log.Error("write TextNode", err)
    38  			}
    39  			// TODO This use of zero-width-space (subsequently replaced by ' ' or ignored, depending on context)
    40  			// TODO works well for in-word breaks, but at the expense of concatenating some words in error.
    41  			// TODO It may be that better examination of the HTML structure could be used to determine
    42  			// TODO when a space is, or is not, required. In that event we would not use zero-width-space.
    43  
    44  		default:
    45  			for c := n.FirstChild; c != nil; c = c.NextSibling {
    46  				writeText(c, b, isTest)
    47  			}
    48  			switch n.DataAtom {
    49  			case 0:
    50  				if n.Data == "documize" {
    51  					for _, a := range n.Attr {
    52  						if a.Key == "type" {
    53  							if isTest {
    54  								var err error
    55  								switch a.Val {
    56  								case "field-start":
    57  									_, err = b.Write([]byte(" [ "))
    58  								case "field-end":
    59  									_, err = b.Write([]byte(" ] "))
    60  								default:
    61  									_, err = b.Write([]byte(" [ ] "))
    62  								}
    63  								if err != nil {
    64  									log.Error("write []", err)
    65  								}
    66  							}
    67  							return
    68  						}
    69  					}
    70  				}
    71  			case atom.Span, atom.U, atom.B, atom.I, atom.Del, atom.Sub, atom.Sup:
    72  				//NoOp
    73  			default:
    74  				_, err := b.Write([]byte(" ")) // add a space after each main element
    75  				if err != nil {
    76  					log.Error("write space", err)
    77  				}
    78  			}
    79  		}
    80  	}
    81  }
    82  
    83  func excluded(n *html.Node) bool {
    84  	if n.DataAtom == atom.Div {
    85  		for _, a := range n.Attr {
    86  			if a.Key == "class" {
    87  				switch a.Val {
    88  				case "documize-first-page",
    89  					"documize-exotic-image",
    90  					"documize-footnote",
    91  					"documize-graphictext",
    92  					"documize-math":
    93  					return true
    94  				}
    95  			}
    96  		}
    97  	}
    98  	return false
    99  }
   100  
   101  // findBody finds the body HTML node if it exists in the tree. Required to bypass the page title text.
   102  func findBody(n *html.Node) *html.Node {
   103  	if n.DataAtom == atom.Body {
   104  		return n
   105  	}
   106  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   107  		r := findBody(c)
   108  		if r != nil {
   109  			return r
   110  		}
   111  	}
   112  	return nil
   113  }
   114  
   115  // Text returns only the plain text elements of the HTML Chunk, concatanated with "\n",
   116  // for use in the TOC or for text indexing.
   117  func (ch HTML) Text(isTest bool) (string, error) {
   118  	var b bytes.Buffer
   119  	doc, err := html.Parse(strings.NewReader(string(ch)))
   120  	if err != nil {
   121  		return "", err
   122  	}
   123  	body := findBody(doc)
   124  	if body == nil {
   125  		body = doc
   126  	}
   127  	writeText(body, &b, isTest)
   128  	return string(b.Bytes()), nil
   129  }
   130  
   131  // EscapeHTMLcomplexChars looks for "complex" characters within HTML
   132  // and replaces them with the HTML escape codes which describe them.
   133  // "Complex" characters are those encoded in more than one byte by UTF8.
   134  func EscapeHTMLcomplexChars(s string) string {
   135  	ret := ""
   136  	for _, r := range s {
   137  		if utf8.RuneLen(r) > 1 {
   138  			ret += fmt.Sprintf("&#%d;", r)
   139  		} else {
   140  			ret += string(r)
   141  		}
   142  	}
   143  	return ret
   144  }
   145  
   146  // EscapeHTMLcomplexCharsByte looks for "complex" characters within HTML
   147  // and replaces them with the HTML escape codes which describe them.
   148  // "Complex" characters are those encoded in more than one byte by UTF8.
   149  func EscapeHTMLcomplexCharsByte(b []byte) []byte {
   150  	var ret bytes.Buffer
   151  	for len(b) > 0 {
   152  		r, size := utf8.DecodeRune(b)
   153  		if utf8.RuneLen(r) > 1 {
   154  			fmt.Fprintf(&ret, "&#%d;", r)
   155  		} else {
   156  			_, err := ret.Write(b[:size])
   157  			if err != nil {
   158  				log.Error("EscapeHTMLcomplexCharsByte", err)
   159  			}
   160  		}
   161  		b = b[size:]
   162  	}
   163  	return ret.Bytes()
   164  }