github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean2/03_condense_top_down.go (about)

     1  package domclean2
     2  
     3  import (
     4  	"strings"
     5  
     6  	"golang.org/x/net/html"
     7  )
     8  
     9  func removeEmptyNodes(n *html.Node, lvl int) {
    10  
    11  	// children
    12  	cc := []*html.Node{}
    13  	for c := n.FirstChild; c != nil; c = c.NextSibling {
    14  		cc = append(cc, c)
    15  	}
    16  	for _, c := range cc {
    17  		removeEmptyNodes(c, lvl+1)
    18  	}
    19  
    20  	// processing
    21  	// empty element nodes
    22  	if n.Type == html.ElementNode && n.Data == "img" {
    23  		src := attrX(n.Attr, "src")
    24  		if src == "" {
    25  			n.Parent.RemoveChild(n)
    26  		}
    27  	}
    28  
    29  	if n.Type == html.ElementNode && n.FirstChild == nil && n.Data == "a" {
    30  		href := attrX(n.Attr, "href")
    31  		if href == "#" || href == "" {
    32  			n.Parent.RemoveChild(n)
    33  		}
    34  	}
    35  
    36  	if n.Type == html.ElementNode && n.FirstChild == nil &&
    37  		(n.Data == "em" || n.Data == "strong") {
    38  		n.Parent.RemoveChild(n)
    39  	}
    40  
    41  	if n.Type == html.ElementNode && n.FirstChild == nil &&
    42  		(n.Data == "div" || n.Data == "span" || n.Data == "li" || n.Data == "p") {
    43  		n.Parent.RemoveChild(n)
    44  	}
    45  
    46  	// spans with less than 2 characters inside => flatten to text
    47  	only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild
    48  	if n.Type == html.ElementNode &&
    49  		n.Data == "span" &&
    50  		only1Child &&
    51  		n.FirstChild.Type == html.TextNode &&
    52  		len(strings.TrimSpace(n.FirstChild.Data)) < 3 {
    53  		n.Type = html.TextNode
    54  		n.Data = n.FirstChild.Data
    55  		n.RemoveChild(n.FirstChild)
    56  	}
    57  
    58  }
    59  
    60  func condenseTopDown(n *html.Node, lvl, lvlExec int) {
    61  
    62  	// like in removeUnwanted, we first assemble children separately.
    63  	// since "NextSibling" might be set to nil during condension
    64  	cc := []*html.Node{}
    65  	for c := n.FirstChild; c != nil; c = c.NextSibling {
    66  		cc = append(cc, c)
    67  	}
    68  
    69  	for _, c := range cc {
    70  		condenseTopDown(c, lvl+1, lvlExec)
    71  	}
    72  
    73  	// position at the end => process from deepest level on upwards
    74  	if lvl == 9 || true {
    75  		topDownV3(n, map[string]bool{"div": true},
    76  			map[string]bool{"div": true, "ul": true, "form": true, "li": true, "p": true,
    77  				"a": true, "span": true})
    78  
    79  		topDownV3(n, map[string]bool{"li": true}, map[string]bool{"div": true})
    80  
    81  	}
    82  
    83  	// condenseTopDown2(n, "li", map[string]bool{"a": true, "div": true}, "li")
    84  
    85  }