github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean2/01_cleanse.go (about)

     1  package domclean2
     2  
     3  import (
     4  	"github.com/pbberlin/tools/stringspb"
     5  	"golang.org/x/net/html"
     6  )
     7  
// cScaffoldLvls is the tree level at which <head> and <body> sit when
// levels are counted as in the sketch below: the doctype is level 0,
// <html> level 1, <head>/<body> level 2.
//
//	!DOCTYPE html head
//	!DOCTYPE html body
//	       0    1    2
//
// It is not referenced in this part of the file.
const cScaffoldLvls = 2
    12  
    13  var (
    14  	ml3 = map[*html.Node]int{}
    15  
    16  	directlyRemoveUnwanted = true
    17  
    18  	nodeDistinct = map[string]int{}
    19  	attrDistinct = map[string]int{}
    20  
    21  	unwanteds = map[string]bool{
    22  		"meta":     true,
    23  		"link":     true,
    24  		"style":    true,
    25  		"iframe":   true,
    26  		"script":   true,
    27  		"noscript": true,
    28  
    29  		"canvas": true,
    30  		"object": true,
    31  
    32  		"wbr": true,
    33  
    34  		"comment": true,
    35  	}
    36  
    37  	exotics = map[string]string{
    38  		"header":  "div",
    39  		"footer":  "div",
    40  		"nav":     "div",
    41  		"section": "div",
    42  		"article": "div",
    43  		"aside":   "div",
    44  
    45  		"fieldset": "div", // check this
    46  
    47  		"dl": "ul",
    48  		"dt": "li",
    49  		"dd": "p",
    50  
    51  		"figure":     "div",
    52  		"figcaption": "p",
    53  
    54  		"i": "em",
    55  		"b": "strong",
    56  	}
    57  
    58  	unwantedAttrs = map[string]bool{
    59  
    60  		"border": true, // check this
    61  
    62  		"style": true,
    63  		"class": true,
    64  		// "alt":                 true,
    65  		// "title":               true,
    66  
    67  		"align":       true,
    68  		"placeholder": true,
    69  
    70  		"target":   true,
    71  		"id":       true,
    72  		"rel":      true,
    73  		"tabindex": true,
    74  		"headline": true,
    75  
    76  		"onload":      true,
    77  		"onclick":     true,
    78  		"onmousedown": true,
    79  		"onerror":     true,
    80  		"onsubmit":    true,
    81  
    82  		"readonly":       true,
    83  		"accept-charset": true,
    84  
    85  		"itemprop":  true,
    86  		"itemtype":  true,
    87  		"itemscope": true,
    88  
    89  		"datetime":               true,
    90  		"current-time":           true,
    91  		"fb-iframe-plugin-query": true,
    92  		"fb-xfbml-state":         true,
    93  
    94  		"frameborder":       true,
    95  		"async":             true,
    96  		"charset":           true,
    97  		"http-equiv":        true,
    98  		"allowtransparency": true,
    99  		"allowfullscreen":   true,
   100  		"scrolling":         true,
   101  		"ftghandled":        true,
   102  		"ftgrandomid":       true,
   103  		"marginwidth":       true,
   104  		"marginheight":      true,
   105  		"vspace":            true,
   106  		"hspace":            true,
   107  		"seamless":          true,
   108  		"aria-hidden":       true,
   109  		"gapi_processed":    true,
   110  		"property":          true,
   111  		"media":             true,
   112  
   113  		"content":  true,
   114  		"language": true,
   115  
   116  		"role":  true,
   117  		"sizes": true,
   118  	}
   119  )
   120  
   121  // maxTreeDepth returns the depth of given DOM node
   122  func maxTreeDepth(n *html.Node, lvl int) (maxLvl int) {
   123  
   124  	maxLvl = lvl
   125  	// Children
   126  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   127  		ret := maxTreeDepth(c, lvl+1)
   128  		if ret > maxLvl {
   129  			maxLvl = ret
   130  		}
   131  	}
   132  	return
   133  }
   134  
   135  // cleansDom performs brute reduction and simplification
   136  //
   137  func cleanseDom(n *html.Node, lvl int) {
   138  
   139  	n.Attr = removeAttr(n.Attr, unwantedAttrs)
   140  
   141  	// Children
   142  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   143  		cleanseDom(c, lvl+1)
   144  	}
   145  
   146  	if directlyRemoveUnwanted {
   147  		removeUnwanted(n)
   148  	} else {
   149  		convertUnwanted(n)
   150  	}
   151  
   152  	// ---
   153  
   154  	convertExotic(n)
   155  
   156  	// one time text normalization
   157  	if n.Type == html.TextNode {
   158  		n.Data = stringspb.NormalizeInnerWhitespace(n.Data)
   159  	}
   160  
   161  }
   162  
   163  // convertUnwanted neutralizes a node.
   164  // Note: We can not directly Remove() nor Replace()
   165  // Since that breaks the recursion one step above!
   166  // At a later stage we employ horizontal traversal
   167  // to actually remove unwanted nodes.
   168  //
   169  // Meanwhile we have devised removeUnwanted() which
   170  // makes convertUnwanted-removeComment obsolete.
   171  //
   172  func convertUnwanted(n *html.Node) {
   173  	if unwanteds[n.Data] {
   174  		n.Type = html.CommentNode
   175  		n.Data = n.Data + " replaced"
   176  	}
   177  }
   178  
   179  // We want to remove some children.
   180  // A direct loop is impossible,
   181  // since "NextSibling" is set to nil during Remove().
   182  // Therefore:
   183  //   First assemble children separately.
   184  //   Then remove them.
   185  func removeUnwanted(n *html.Node) {
   186  	cc := []*html.Node{}
   187  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   188  		cc = append(cc, c)
   189  	}
   190  	for _, c := range cc {
   191  		if unwanteds[c.Data] {
   192  			n.RemoveChild(c)
   193  		}
   194  	}
   195  }
   196  
   197  // convertExotic standardizes <section> or <header> nodes
   198  // towards <div> nodes.
   199  func convertExotic(n *html.Node) {
   200  	if repl, ok := exotics[n.Data]; ok {
   201  		n.Attr = append(n.Attr, html.Attribute{"", "cfrm", n.Data})
   202  		n.Data = repl
   203  	}
   204  }