github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/wpjsontool/tidy.go (about)

     1  package main
     2  
     3  import (
     4  	"golang.org/x/net/html"
     5  	"golang.org/x/net/html/atom"
     6  	"regexp"
     7  	"strings"
     8  )
     9  
    10  func SanitiseHTMLString(h string) (string, error) {
    11  	bod := &html.Node{
    12  		Type:     html.ElementNode,
    13  		Data:     "body",
    14  		DataAtom: atom.Body,
    15  	}
    16  
    17  	nodes, err := html.ParseFragment(strings.NewReader(h), bod)
    18  	if err != nil {
    19  		return "", err
    20  	}
    21  
    22  	var b strings.Builder
    23  	for _, n := range nodes {
    24  		keep := TidyNode(n)
    25  		if keep {
    26  			err = html.Render(&b, n)
    27  			if err != nil {
    28  				return "", err
    29  			}
    30  		}
    31  	}
    32  
    33  	return b.String(), nil
    34  }
    35  
    36  func SingleLine(s string) string {
    37  	return strings.Join(strings.Fields(s), " ")
    38  }
    39  
    40  // a list of allowed elements and their allowed attrs
    41  // all missing elements or attrs should be stripped
    42  var elementWhitelist = map[atom.Atom][]atom.Atom{
    43  	// basing on list at  https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/HTML5/HTML5_element_list
    44  
    45  	//Sections
    46  	atom.Section: {},
    47  	// atom.Nav?
    48  	atom.Article: {},
    49  	atom.Aside:   {},
    50  	atom.H1:      {},
    51  	atom.H2:      {},
    52  	atom.H3:      {},
    53  	atom.H4:      {},
    54  	atom.H5:      {},
    55  	atom.H6:      {},
    56  	atom.Header:  {}, // should disallow?
    57  	atom.Footer:  {}, // should disallow?
    58  	atom.Address: {},
    59  	//atom.Main?
    60  
    61  	// Grouping content
    62  	atom.P:          {},
    63  	atom.Hr:         {},
    64  	atom.Pre:        {},
    65  	atom.Blockquote: {},
    66  	atom.Ol:         {},
    67  	atom.Ul:         {},
    68  	atom.Li:         {},
    69  	atom.Dl:         {},
    70  	atom.Dt:         {},
    71  	atom.Dd:         {},
    72  	atom.Figure:     {},
    73  	atom.Figcaption: {},
    74  	atom.Div:        {},
    75  
    76  	// Text-level semantics
    77  	atom.A:      {atom.Href},
    78  	atom.Em:     {},
    79  	atom.Font:   {},
    80  	atom.Strong: {},
    81  	atom.Small:  {},
    82  	atom.S:      {},
    83  	atom.Cite:   {},
    84  	atom.Q:      {},
    85  	atom.Dfn:    {},
    86  	atom.Abbr:   {atom.Title},
    87  	// atom.Data
    88  	atom.Time: {atom.Datetime},
    89  	atom.Code: {},
    90  	atom.Var:  {},
    91  	atom.Samp: {},
    92  	atom.Kbd:  {},
    93  	atom.Sub:  {},
    94  	atom.Sup:  {},
    95  	atom.I:    {},
    96  	atom.B:    {},
    97  	atom.U:    {},
    98  	atom.Mark: {},
    99  	atom.Ruby: {},
   100  	atom.Rt:   {},
   101  	atom.Rp:   {},
   102  	atom.Bdi:  {},
   103  	atom.Bdo:  {},
   104  	atom.Span: {},
   105  	atom.Br:   {},
   106  	atom.Wbr:  {},
   107  
   108  	// Edits
   109  	atom.Ins: {},
   110  	atom.Del: {},
   111  
   112  	//Embedded content
   113  	atom.Img: {atom.Src, atom.Alt},
   114  	// atom.Video?
   115  	// atom.Audio?
   116  	// atom.Map?
   117  	// atom.Area?
   118  	// atom.Svg?
   119  	// atom.Math?
   120  
   121  	// Tabular data
   122  	atom.Table:    {},
   123  	atom.Caption:  {},
   124  	atom.Colgroup: {},
   125  	atom.Col:      {},
   126  	atom.Tbody:    {},
   127  	atom.Thead:    {},
   128  	atom.Tfoot:    {},
   129  	atom.Tr:       {},
   130  	atom.Td:       {},
   131  	atom.Th:       {},
   132  
   133  	// Forms
   134  
   135  	// Interactive elements
   136  
   137  }
   138  
   139  func filterAttrs(n *html.Node, fn func(*html.Attribute) bool) {
   140  	var out = make([]html.Attribute, 0)
   141  	for _, a := range n.Attr {
   142  		if fn(&a) {
   143  			out = append(out, a)
   144  		}
   145  	}
   146  	n.Attr = out
   147  }
   148  
   149  // getAttr retrieved the value of an attribute on a node.
   150  // Returns empty string if attribute doesn't exist.
   151  func getAttr(n *html.Node, attr string) string {
   152  	for _, a := range n.Attr {
   153  		if a.Key == attr {
   154  			return a.Val
   155  		}
   156  	}
   157  	return ""
   158  }
   159  
   160  // Tidy up extracted content into something that'll produce reasonable html when
   161  // rendered. Returns true if node should be kept. false to cull.
   162  // - remove comments
   163  // - trim empty text nodes
   164  // - TODO make links absolute
   165  func TidyNode(n *html.Node) bool {
   166  
   167  	if n.Type == html.CommentNode {
   168  		return false // cull comments
   169  	}
   170  
   171  	// trim excessive leading/trailing space in text nodes, and cull empty ones
   172  	if n.Type == html.TextNode {
   173  		leadingSpace := regexp.MustCompile(`^\s+`)
   174  		trailingSpace := regexp.MustCompile(`\s+$`)
   175  		txt := leadingSpace.ReplaceAllStringFunc(n.Data, func(in string) string {
   176  			if strings.Contains(in, "\n") {
   177  				return "\n"
   178  			} else {
   179  				return " "
   180  			}
   181  		})
   182  		txt = trailingSpace.ReplaceAllStringFunc(n.Data, func(in string) string {
   183  			if strings.Contains(in, "\n") {
   184  				return "\n"
   185  			} else {
   186  				return " "
   187  			}
   188  		})
   189  		txt = strings.TrimSpace(txt)
   190  		if len(txt) == 0 {
   191  			return false // cull empty text
   192  		} else {
   193  			n.Data = txt
   194  		}
   195  	}
   196  
   197  	// remove any elements not on the whitelist
   198  	if n.Type == html.ElementNode {
   199  		allowedAttrs, whiteListed := elementWhitelist[n.DataAtom]
   200  		if !whiteListed {
   201  			return false // cull non-whitelist element
   202  		}
   203  
   204  		// remove attrs not on whitelist
   205  		filterAttrs(n, func(attr *html.Attribute) bool {
   206  			for _, allowed := range allowedAttrs {
   207  				if attr.Key == allowed.String() {
   208  					return true
   209  				}
   210  			}
   211  			return false
   212  		})
   213  
   214  		// special logic for images - strip out ones with huge URIs (eg embedded
   215  		// 'data:' + base64 encoded images)
   216  		if n.DataAtom == atom.Img {
   217  			const maxSrcURI = 1024
   218  			src := getAttr(n, "src")
   219  			if len(src) > maxSrcURI {
   220  				return false // cull: URI too big
   221  			}
   222  		}
   223  	}
   224  
   225  	// recurse
   226  	for child := n.FirstChild; child != nil; {
   227  		c := child
   228  		// fetch next one in advance (because current one be removed)
   229  		child = child.NextSibling
   230  		keep := TidyNode(c)
   231  		if !keep {
   232  			n.RemoveChild(c)
   233  		}
   234  	}
   235  
   236  	return true
   237  }