github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/wpjsontool/tidy.go (about) 1 package main 2 3 import ( 4 "golang.org/x/net/html" 5 "golang.org/x/net/html/atom" 6 "regexp" 7 "strings" 8 ) 9 10 func SanitiseHTMLString(h string) (string, error) { 11 bod := &html.Node{ 12 Type: html.ElementNode, 13 Data: "body", 14 DataAtom: atom.Body, 15 } 16 17 nodes, err := html.ParseFragment(strings.NewReader(h), bod) 18 if err != nil { 19 return "", err 20 } 21 22 var b strings.Builder 23 for _, n := range nodes { 24 keep := TidyNode(n) 25 if keep { 26 err = html.Render(&b, n) 27 if err != nil { 28 return "", err 29 } 30 } 31 } 32 33 return b.String(), nil 34 } 35 36 func SingleLine(s string) string { 37 return strings.Join(strings.Fields(s), " ") 38 } 39 40 // a list of allowed elements and their allowed attrs 41 // all missing elements or attrs should be stripped 42 var elementWhitelist = map[atom.Atom][]atom.Atom{ 43 // basing on list at https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/HTML5/HTML5_element_list 44 45 //Sections 46 atom.Section: {}, 47 // atom.Nav? 48 atom.Article: {}, 49 atom.Aside: {}, 50 atom.H1: {}, 51 atom.H2: {}, 52 atom.H3: {}, 53 atom.H4: {}, 54 atom.H5: {}, 55 atom.H6: {}, 56 atom.Header: {}, // should disallow? 57 atom.Footer: {}, // should disallow? 58 atom.Address: {}, 59 //atom.Main? 60 61 // Grouping content 62 atom.P: {}, 63 atom.Hr: {}, 64 atom.Pre: {}, 65 atom.Blockquote: {}, 66 atom.Ol: {}, 67 atom.Ul: {}, 68 atom.Li: {}, 69 atom.Dl: {}, 70 atom.Dt: {}, 71 atom.Dd: {}, 72 atom.Figure: {}, 73 atom.Figcaption: {}, 74 atom.Div: {}, 75 76 // Text-level semantics 77 atom.A: {atom.Href}, 78 atom.Em: {}, 79 atom.Font: {}, 80 atom.Strong: {}, 81 atom.Small: {}, 82 atom.S: {}, 83 atom.Cite: {}, 84 atom.Q: {}, 85 atom.Dfn: {}, 86 atom.Abbr: {atom.Title}, 87 // atom.Data 88 atom.Time: {atom.Datetime}, 89 atom.Code: {}, 90 atom.Var: {}, 91 atom.Samp: {}, 92 atom.Kbd: {}, 93 atom.Sub: {}, 94 atom.Sup: {}, 95 atom.I: {}, 96 atom.B: {}, 97 atom.U: {}, 98 atom.Mark: {}, 99 atom.Ruby: {}, 100 atom.Rt: {}, 101 atom.Rp: {}, 102 atom.Bdi: {}, 103 atom.Bdo: {}, 104 atom.Span: {}, 105 atom.Br: {}, 106 atom.Wbr: {}, 107 108 // Edits 109 atom.Ins: {}, 110 atom.Del: {}, 111 112 //Embedded content 113 atom.Img: {atom.Src, atom.Alt}, 114 // atom.Video? 115 // atom.Audio? 116 // atom.Map? 117 // atom.Area? 118 // atom.Svg? 119 // atom.Math? 120 121 // Tabular data 122 atom.Table: {}, 123 atom.Caption: {}, 124 atom.Colgroup: {}, 125 atom.Col: {}, 126 atom.Tbody: {}, 127 atom.Thead: {}, 128 atom.Tfoot: {}, 129 atom.Tr: {}, 130 atom.Td: {}, 131 atom.Th: {}, 132 133 // Forms 134 135 // Interactive elements 136 137 } 138 139 func filterAttrs(n *html.Node, fn func(*html.Attribute) bool) { 140 var out = make([]html.Attribute, 0) 141 for _, a := range n.Attr { 142 if fn(&a) { 143 out = append(out, a) 144 } 145 } 146 n.Attr = out 147 } 148 149 // getAttr retrieved the value of an attribute on a node. 150 // Returns empty string if attribute doesn't exist. 151 func getAttr(n *html.Node, attr string) string { 152 for _, a := range n.Attr { 153 if a.Key == attr { 154 return a.Val 155 } 156 } 157 return "" 158 } 159 160 // Tidy up extracted content into something that'll produce reasonable html when 161 // rendered. Returns true if node should be kept. false to cull. 162 // - remove comments 163 // - trim empty text nodes 164 // - TODO make links absolute 165 func TidyNode(n *html.Node) bool { 166 167 if n.Type == html.CommentNode { 168 return false // cull comments 169 } 170 171 // trim excessive leading/trailing space in text nodes, and cull empty ones 172 if n.Type == html.TextNode { 173 leadingSpace := regexp.MustCompile(`^\s+`) 174 trailingSpace := regexp.MustCompile(`\s+$`) 175 txt := leadingSpace.ReplaceAllStringFunc(n.Data, func(in string) string { 176 if strings.Contains(in, "\n") { 177 return "\n" 178 } else { 179 return " " 180 } 181 }) 182 txt = trailingSpace.ReplaceAllStringFunc(n.Data, func(in string) string { 183 if strings.Contains(in, "\n") { 184 return "\n" 185 } else { 186 return " " 187 } 188 }) 189 txt = strings.TrimSpace(txt) 190 if len(txt) == 0 { 191 return false // cull empty text 192 } else { 193 n.Data = txt 194 } 195 } 196 197 // remove any elements not on the whitelist 198 if n.Type == html.ElementNode { 199 allowedAttrs, whiteListed := elementWhitelist[n.DataAtom] 200 if !whiteListed { 201 return false // cull non-whitelist element 202 } 203 204 // remove attrs not on whitelist 205 filterAttrs(n, func(attr *html.Attribute) bool { 206 for _, allowed := range allowedAttrs { 207 if attr.Key == allowed.String() { 208 return true 209 } 210 } 211 return false 212 }) 213 214 // special logic for images - strip out ones with huge URIs (eg embedded 215 // 'data:' + base64 encoded images) 216 if n.DataAtom == atom.Img { 217 const maxSrcURI = 1024 218 src := getAttr(n, "src") 219 if len(src) > maxSrcURI { 220 return false // cull: URI too big 221 } 222 } 223 } 224 225 // recurse 226 for child := n.FirstChild; child != nil; { 227 c := child 228 // fetch next one in advance (because current one be removed) 229 child = child.NextSibling 230 keep := TidyNode(c) 231 if !keep { 232 n.RemoveChild(c) 233 } 234 } 235 236 return true 237 }