github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean2/05_breakout_imgs_from_a_trees.go (about) 1 package domclean2 2 3 import ( 4 "log" 5 "net/url" 6 "regexp" 7 "strings" 8 9 "github.com/pbberlin/tools/net/http/dom" 10 "golang.org/x/net/html" 11 ) 12 13 var debugBreakOut = false 14 15 func searchImg(n *html.Node, fnd *html.Node, lvl int) (*html.Node, int) { 16 17 if n.Type == html.ElementNode && n.Data == "img" { 18 // log.Printf(" a has img on lvl %v\n", lvl) 19 if fnd == nil { 20 fnd = n 21 return fnd, lvl 22 } 23 } 24 25 for c := n.FirstChild; c != nil; c = c.NextSibling { 26 fnd, lvlfnd := searchImg(c, fnd, lvl+1) 27 if fnd != nil { 28 return fnd, lvlfnd 29 } 30 } 31 32 return fnd, lvl 33 } 34 35 type DeleterFunc func(*html.Node, int, bool) bool 36 37 func closureDeleter(until bool) DeleterFunc { 38 39 // Nodes along the path to the splitting image 40 // should never not be removed in *neither* tree 41 var splitPath = map[*html.Node]bool{} 42 43 var fc DeleterFunc 44 fc = func(n *html.Node, lvl int, found bool) bool { 45 46 // fmt.Printf("found %v at l%v\n", found, lvl) 47 if n.Data == "img" { 48 // fmt.Printf(" found at l%v\n", lvl) 49 found = true 50 par := n.Parent 51 for { 52 if par == nil { 53 break 54 } 55 splitPath[par] = true 56 par = par.Parent 57 } 58 } 59 60 // children 61 cc := []*html.Node{} 62 for c := n.FirstChild; c != nil; c = c.NextSibling { 63 cc = append(cc, c) 64 } 65 for _, c := range cc { 66 found = fc(c, lvl+1, found) 67 } 68 69 // 70 // remove 71 if lvl > 0 { 72 if n.Data == "img" { 73 n.Parent.RemoveChild(n) 74 } else { 75 if !until && !found && !splitPath[n] { 76 n.Parent.RemoveChild(n) 77 } 78 if until && found && !splitPath[n] { 79 n.Parent.RemoveChild(n) 80 } 81 } 82 } 83 84 return found 85 86 } 87 88 return fc 89 90 } 91 92 func breakoutImagesFromAnchorTrees(n *html.Node) { 93 94 for c := n.FirstChild; c != nil; c = c.NextSibling { 95 breakoutImagesFromAnchorTrees(c) 96 } 97 98 if n.Type == html.ElementNode && n.Data == "a" { 99 100 img, lvl := searchImg(n, nil, 0) 101 102 if img != nil { 103 104 only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild 105 if lvl == 1 && only1Child { 106 // log.Printf("only child image lvl %v a\n", lvl) 107 n.RemoveChild(img) 108 n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end" 109 contnt := urlBeautify(attrX(n.Attr, "href")) 110 if len(contnt) < 6 { 111 contnt = "[was img] " + contnt 112 } 113 n.AppendChild(dom.Nd("text", contnt)) 114 } else { 115 116 if debugBreakOut { 117 b0 := dom.PrintSubtree(n) 118 log.Printf("\n%s\n", b0) 119 } 120 121 // log.Printf(" got it %v\n", img.Data) 122 a1 := dom.CloneNodeWithSubtree(n) 123 fc1 := closureDeleter(true) 124 fc1(n, 0, false) 125 if debugBreakOut { 126 b1 := dom.PrintSubtree(n) 127 log.Printf("\n%s\n", b1) 128 } 129 130 fc2 := closureDeleter(false) 131 fc2(a1, 0, false) 132 if debugBreakOut { 133 b2 := dom.PrintSubtree(a1) 134 log.Printf("\n%s\n", b2) 135 log.Printf("--------------------\n") 136 } 137 138 if true { 139 n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end" 140 n.Parent.InsertBefore(a1, img.NextSibling) 141 } else { 142 // old way ; sequence corrpution if n had rightwise siblings. 143 n.Parent.AppendChild(img) 144 n.Parent.AppendChild(a1) 145 146 } 147 148 } 149 150 // changing image to link later 151 152 } else { 153 // log.Printf("no img in a\n") 154 } 155 } 156 157 } 158 159 var allNumbers = regexp.MustCompile(`[0-9]+`) 160 161 func urlBeautify(surl string) string { 162 if !strings.HasPrefix(surl, "http://") && !strings.HasPrefix(surl, "https://") { 163 surl = "https://" + surl 164 } 165 166 url2, err := url.Parse(surl) 167 if err != nil { 168 return surl 169 } 170 171 hst := url2.Host 172 if strings.Count(hst, ".") > 1 { 173 parts := strings.Split(hst, ".") 174 lenP := len(parts) 175 hst = parts[lenP-2] + "." + parts[lenP-1] 176 } 177 178 pth := url2.Path 179 pth = allNumbers.ReplaceAllString(pth, "") 180 181 return hst + pth 182 183 }