github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean2/03_condense_top_down.go (about) 1 package domclean2 2 3 import ( 4 "strings" 5 6 "golang.org/x/net/html" 7 ) 8 9 func removeEmptyNodes(n *html.Node, lvl int) { 10 11 // children 12 cc := []*html.Node{} 13 for c := n.FirstChild; c != nil; c = c.NextSibling { 14 cc = append(cc, c) 15 } 16 for _, c := range cc { 17 removeEmptyNodes(c, lvl+1) 18 } 19 20 // processing 21 // empty element nodes 22 if n.Type == html.ElementNode && n.Data == "img" { 23 src := attrX(n.Attr, "src") 24 if src == "" { 25 n.Parent.RemoveChild(n) 26 } 27 } 28 29 if n.Type == html.ElementNode && n.FirstChild == nil && n.Data == "a" { 30 href := attrX(n.Attr, "href") 31 if href == "#" || href == "" { 32 n.Parent.RemoveChild(n) 33 } 34 } 35 36 if n.Type == html.ElementNode && n.FirstChild == nil && 37 (n.Data == "em" || n.Data == "strong") { 38 n.Parent.RemoveChild(n) 39 } 40 41 if n.Type == html.ElementNode && n.FirstChild == nil && 42 (n.Data == "div" || n.Data == "span" || n.Data == "li" || n.Data == "p") { 43 n.Parent.RemoveChild(n) 44 } 45 46 // spans with less than 2 characters inside => flatten to text 47 only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild 48 if n.Type == html.ElementNode && 49 n.Data == "span" && 50 only1Child && 51 n.FirstChild.Type == html.TextNode && 52 len(strings.TrimSpace(n.FirstChild.Data)) < 3 { 53 n.Type = html.TextNode 54 n.Data = n.FirstChild.Data 55 n.RemoveChild(n.FirstChild) 56 } 57 58 } 59 60 func condenseTopDown(n *html.Node, lvl, lvlExec int) { 61 62 // like in removeUnwanted, we first assemble children separately. 63 // since "NextSibling" might be set to nil during condension 64 cc := []*html.Node{} 65 for c := n.FirstChild; c != nil; c = c.NextSibling { 66 cc = append(cc, c) 67 } 68 69 for _, c := range cc { 70 condenseTopDown(c, lvl+1, lvlExec) 71 } 72 73 // position at the end => process from deepest level on upwards 74 if lvl == 9 || true { 75 topDownV3(n, map[string]bool{"div": true}, 76 map[string]bool{"div": true, "ul": true, "form": true, "li": true, "p": true, 77 "a": true, "span": true}) 78 79 topDownV3(n, map[string]bool{"li": true}, map[string]bool{"div": true}) 80 81 } 82 83 // condenseTopDown2(n, "li", map[string]bool{"a": true, "div": true}, "li") 84 85 }