github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean2/01_cleanse.go (about) 1 package domclean2 2 3 import ( 4 "github.com/pbberlin/tools/stringspb" 5 "golang.org/x/net/html" 6 ) 7 8 // !DOCTYPE html head 9 // !DOCTYPE html body 10 // 0 1 2 11 const cScaffoldLvls = 2 12 13 var ( 14 ml3 = map[*html.Node]int{} 15 16 directlyRemoveUnwanted = true 17 18 nodeDistinct = map[string]int{} 19 attrDistinct = map[string]int{} 20 21 unwanteds = map[string]bool{ 22 "meta": true, 23 "link": true, 24 "style": true, 25 "iframe": true, 26 "script": true, 27 "noscript": true, 28 29 "canvas": true, 30 "object": true, 31 32 "wbr": true, 33 34 "comment": true, 35 } 36 37 exotics = map[string]string{ 38 "header": "div", 39 "footer": "div", 40 "nav": "div", 41 "section": "div", 42 "article": "div", 43 "aside": "div", 44 45 "fieldset": "div", // check this 46 47 "dl": "ul", 48 "dt": "li", 49 "dd": "p", 50 51 "figure": "div", 52 "figcaption": "p", 53 54 "i": "em", 55 "b": "strong", 56 } 57 58 unwantedAttrs = map[string]bool{ 59 60 "border": true, // check this 61 62 "style": true, 63 "class": true, 64 // "alt": true, 65 // "title": true, 66 67 "align": true, 68 "placeholder": true, 69 70 "target": true, 71 "id": true, 72 "rel": true, 73 "tabindex": true, 74 "headline": true, 75 76 "onload": true, 77 "onclick": true, 78 "onmousedown": true, 79 "onerror": true, 80 "onsubmit": true, 81 82 "readonly": true, 83 "accept-charset": true, 84 85 "itemprop": true, 86 "itemtype": true, 87 "itemscope": true, 88 89 "datetime": true, 90 "current-time": true, 91 "fb-iframe-plugin-query": true, 92 "fb-xfbml-state": true, 93 94 "frameborder": true, 95 "async": true, 96 "charset": true, 97 "http-equiv": true, 98 "allowtransparency": true, 99 "allowfullscreen": true, 100 "scrolling": true, 101 "ftghandled": true, 102 "ftgrandomid": true, 103 "marginwidth": true, 104 "marginheight": true, 105 "vspace": true, 106 "hspace": true, 107 "seamless": true, 108 "aria-hidden": true, 109 "gapi_processed": true, 110 "property": true, 111 "media": true, 112 113 "content": true, 114 "language": true, 115 116 "role": true, 117 "sizes": true, 118 } 119 ) 120 121 // maxTreeDepth returns the depth of given DOM node 122 func maxTreeDepth(n *html.Node, lvl int) (maxLvl int) { 123 124 maxLvl = lvl 125 // Children 126 for c := n.FirstChild; c != nil; c = c.NextSibling { 127 ret := maxTreeDepth(c, lvl+1) 128 if ret > maxLvl { 129 maxLvl = ret 130 } 131 } 132 return 133 } 134 135 // cleansDom performs brute reduction and simplification 136 // 137 func cleanseDom(n *html.Node, lvl int) { 138 139 n.Attr = removeAttr(n.Attr, unwantedAttrs) 140 141 // Children 142 for c := n.FirstChild; c != nil; c = c.NextSibling { 143 cleanseDom(c, lvl+1) 144 } 145 146 if directlyRemoveUnwanted { 147 removeUnwanted(n) 148 } else { 149 convertUnwanted(n) 150 } 151 152 // --- 153 154 convertExotic(n) 155 156 // one time text normalization 157 if n.Type == html.TextNode { 158 n.Data = stringspb.NormalizeInnerWhitespace(n.Data) 159 } 160 161 } 162 163 // convertUnwanted neutralizes a node. 164 // Note: We can not directly Remove() nor Replace() 165 // Since that breaks the recursion one step above! 166 // At a later stage we employ horizontal traversal 167 // to actually remove unwanted nodes. 168 // 169 // Meanwhile we have devised removeUnwanted() which 170 // makes convertUnwanted-removeComment obsolete. 171 // 172 func convertUnwanted(n *html.Node) { 173 if unwanteds[n.Data] { 174 n.Type = html.CommentNode 175 n.Data = n.Data + " replaced" 176 } 177 } 178 179 // We want to remove some children. 180 // A direct loop is impossible, 181 // since "NextSibling" is set to nil during Remove(). 182 // Therefore: 183 // First assemble children separately. 184 // Then remove them. 185 func removeUnwanted(n *html.Node) { 186 cc := []*html.Node{} 187 for c := n.FirstChild; c != nil; c = c.NextSibling { 188 cc = append(cc, c) 189 } 190 for _, c := range cc { 191 if unwanteds[c.Data] { 192 n.RemoveChild(c) 193 } 194 } 195 } 196 197 // convertExotic standardizes <section> or <header> nodes 198 // towards <div> nodes. 199 func convertExotic(n *html.Node) { 200 if repl, ok := exotics[n.Data]; ok { 201 n.Attr = append(n.Attr, html.Attribute{"", "cfrm", n.Data}) 202 n.Data = repl 203 } 204 }