github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean2/05_breakout_imgs_from_a_trees.go (about)

     1  package domclean2
     2  
import (
	"log"
	"net"
	"net/url"
	"regexp"
	"strings"

	"github.com/pbberlin/tools/net/http/dom"
	"golang.org/x/net/html"
)
    12  
// debugBreakOut enables logging of the anchor subtrees before and after
// they are split in breakoutImagesFromAnchorTrees.
var debugBreakOut = false
    14  
    15  func searchImg(n *html.Node, fnd *html.Node, lvl int) (*html.Node, int) {
    16  
    17  	if n.Type == html.ElementNode && n.Data == "img" {
    18  		// log.Printf("  a has img on lvl %v\n", lvl)
    19  		if fnd == nil {
    20  			fnd = n
    21  			return fnd, lvl
    22  		}
    23  	}
    24  
    25  	for c := n.FirstChild; c != nil; c = c.NextSibling {
    26  		fnd, lvlfnd := searchImg(c, fnd, lvl+1)
    27  		if fnd != nil {
    28  			return fnd, lvlfnd
    29  		}
    30  	}
    31  
    32  	return fnd, lvl
    33  }
    34  
// DeleterFunc is the signature of the recursive pruning function built by
// closureDeleter. It receives the node to process, that node's depth below the
// starting node, and whether the splitting image has already been encountered;
// it returns the updated "image encountered" state.
type DeleterFunc func(*html.Node, int, bool) bool
    36  
    37  func closureDeleter(until bool) DeleterFunc {
    38  
    39  	// Nodes along the path to the splitting image
    40  	// should never not be removed in *neither* tree
    41  	var splitPath = map[*html.Node]bool{}
    42  
    43  	var fc DeleterFunc
    44  	fc = func(n *html.Node, lvl int, found bool) bool {
    45  
    46  		// fmt.Printf("found %v at l%v\n", found, lvl)
    47  		if n.Data == "img" {
    48  			// fmt.Printf(" found at l%v\n", lvl)
    49  			found = true
    50  			par := n.Parent
    51  			for {
    52  				if par == nil {
    53  					break
    54  				}
    55  				splitPath[par] = true
    56  				par = par.Parent
    57  			}
    58  		}
    59  
    60  		// children
    61  		cc := []*html.Node{}
    62  		for c := n.FirstChild; c != nil; c = c.NextSibling {
    63  			cc = append(cc, c)
    64  		}
    65  		for _, c := range cc {
    66  			found = fc(c, lvl+1, found)
    67  		}
    68  
    69  		//
    70  		// remove
    71  		if lvl > 0 {
    72  			if n.Data == "img" {
    73  				n.Parent.RemoveChild(n)
    74  			} else {
    75  				if !until && !found && !splitPath[n] {
    76  					n.Parent.RemoveChild(n)
    77  				}
    78  				if until && found && !splitPath[n] {
    79  					n.Parent.RemoveChild(n)
    80  				}
    81  			}
    82  		}
    83  
    84  		return found
    85  
    86  	}
    87  
    88  	return fc
    89  
    90  }
    91  
    92  func breakoutImagesFromAnchorTrees(n *html.Node) {
    93  
    94  	for c := n.FirstChild; c != nil; c = c.NextSibling {
    95  		breakoutImagesFromAnchorTrees(c)
    96  	}
    97  
    98  	if n.Type == html.ElementNode && n.Data == "a" {
    99  
   100  		img, lvl := searchImg(n, nil, 0)
   101  
   102  		if img != nil {
   103  
   104  			only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild
   105  			if lvl == 1 && only1Child {
   106  				// log.Printf("only child image lvl %v a\n", lvl)
   107  				n.RemoveChild(img)
   108  				n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end"
   109  				contnt := urlBeautify(attrX(n.Attr, "href"))
   110  				if len(contnt) < 6 {
   111  					contnt = "[was img] " + contnt
   112  				}
   113  				n.AppendChild(dom.Nd("text", contnt))
   114  			} else {
   115  
   116  				if debugBreakOut {
   117  					b0 := dom.PrintSubtree(n)
   118  					log.Printf("\n%s\n", b0)
   119  				}
   120  
   121  				// log.Printf("  got it  %v\n", img.Data)
   122  				a1 := dom.CloneNodeWithSubtree(n)
   123  				fc1 := closureDeleter(true)
   124  				fc1(n, 0, false)
   125  				if debugBreakOut {
   126  					b1 := dom.PrintSubtree(n)
   127  					log.Printf("\n%s\n", b1)
   128  				}
   129  
   130  				fc2 := closureDeleter(false)
   131  				fc2(a1, 0, false)
   132  				if debugBreakOut {
   133  					b2 := dom.PrintSubtree(a1)
   134  					log.Printf("\n%s\n", b2)
   135  					log.Printf("--------------------\n")
   136  				}
   137  
   138  				if true {
   139  					n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end"
   140  					n.Parent.InsertBefore(a1, img.NextSibling)
   141  				} else {
   142  					// old way ; sequence corrpution if n had rightwise siblings.
   143  					n.Parent.AppendChild(img)
   144  					n.Parent.AppendChild(a1)
   145  
   146  				}
   147  
   148  			}
   149  
   150  			// changing image to link later
   151  
   152  		} else {
   153  			// log.Printf("no img in a\n")
   154  		}
   155  	}
   156  
   157  }
   158  
   159  var allNumbers = regexp.MustCompile(`[0-9]+`)
   160  
   161  func urlBeautify(surl string) string {
   162  	if !strings.HasPrefix(surl, "http://") && !strings.HasPrefix(surl, "https://") {
   163  		surl = "https://" + surl
   164  	}
   165  
   166  	url2, err := url.Parse(surl)
   167  	if err != nil {
   168  		return surl
   169  	}
   170  
   171  	hst := url2.Host
   172  	if strings.Count(hst, ".") > 1 {
   173  		parts := strings.Split(hst, ".")
   174  		lenP := len(parts)
   175  		hst = parts[lenP-2] + "." + parts[lenP-1]
   176  	}
   177  
   178  	pth := url2.Path
   179  	pth = allNumbers.ReplaceAllString(pth, "")
   180  
   181  	return hst + pth
   182  
   183  }