github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean1/1_cleanup.go (about) 1 // Package domclean1 normalizes html dom trees in a primitive way. 2 package domclean1 3 4 import ( 5 "bytes" 6 "fmt" 7 "net/http" 8 "net/url" 9 "strings" 10 11 "github.com/pbberlin/tools/net/http/dom" 12 "github.com/pbberlin/tools/net/http/fetch" 13 "golang.org/x/net/html" 14 ) 15 16 var fCondenseNode func(*html.Node, int) string 17 var fRecurse func(*html.Node) 18 19 const emptySrc = "//:0" 20 21 // r is the request to the proxy 22 // u is the url, that the proxy has called 23 func ModifyHTML(r *http.Request, u *url.URL, s string) string { 24 25 var nums int // counter 26 27 // needed to get the current request into the 28 // "static" recursive functions 29 var PackageProxyHost = r.Host // port included! 30 var PackageRemoteHost = fetch.HostFromUrl(u) 31 32 fCondenseNode = func(n *html.Node, depth int) (ret string) { 33 34 if n.Type == html.ElementNode && n.Data == "script" { 35 ret += fmt.Sprintf(" var script%v = '[script]'; ", nums) 36 nums++ 37 return 38 } 39 if n.Type == html.ElementNode && n.Data == "style" { 40 ret += fmt.Sprintf(" .xxx {margin:2px;} ") 41 return 42 } 43 44 if n.Type == html.ElementNode && n.Data == "img" { 45 ret += fmt.Sprintf(" [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src")) 46 } 47 48 if n.Type == html.ElementNode && n.Data == "a" { 49 ret += "[a]" 50 } 51 52 if n.Type == html.TextNode { 53 s := n.Data 54 // s = replTabsNewline.Replace(s) 55 // s = strings.TrimSpace(s) 56 if len(s) < 4 { 57 ret += s 58 } else if s != "" { 59 if depth > 0 { 60 ret += fmt.Sprintf(" [txt%v] %v", depth, s) 61 } else { 62 ret += " [txt] " + s 63 } 64 } 65 } 66 67 for c := n.FirstChild; c != nil; c = c.NextSibling { 68 ret += fCondenseNode(c, depth+1) 69 } 70 return 71 } 72 73 // -------------------------- 74 // ---------------------- 75 76 fRecurse = func(n *html.Node) { 77 78 if n.Type == html.ElementNode && n.Data == "form" { 79 hidFld := new(html.Node) 80 hidFld.Type = html.ElementNode 81 hidFld.Data = "input" 82 hidFld.Attr = []html.Attribute{ 83 html.Attribute{Key: "name", Val: "redirect-to"}, 84 html.Attribute{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"), PackageRemoteHost)}, 85 } 86 n.AppendChild(hidFld) 87 88 submt := new(html.Node) 89 submt.Type = html.ElementNode 90 submt.Data = "input" 91 submt.Attr = []html.Attribute{ 92 html.Attribute{Key: "type", Val: "submit"}, 93 html.Attribute{Key: "value", Val: "subm"}, 94 html.Attribute{Key: "accesskey", Val: "f"}, 95 } 96 n.AppendChild(submt) 97 98 n.Attr = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost) 99 100 } 101 if n.Type == html.ElementNode && n.Data == "script" { 102 for i := 0; i < len(n.Attr); i++ { 103 if n.Attr[i].Key == "src" { 104 n.Attr[i].Val = emptySrc 105 } 106 } 107 } 108 if n.Type == html.ElementNode && 109 (n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") { 110 111 s := fCondenseNode(n, 0) 112 //fmt.Printf("found %v\n", s) 113 textReplacement := new(html.Node) 114 textReplacement.Type = html.TextNode 115 textReplacement.Data = s 116 117 attrStore := []html.Attribute{} 118 if n.Data == "a" || n.Data == "img" { 119 attrStore = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost) 120 } 121 if n.Data == "img" { 122 n.Data = "a" 123 } 124 if n.Data == "a" { 125 n.Attr = attrStore 126 } 127 128 // We want to remove all existing children. 129 // Direct loop impossible, since "NextSibling" is set to nil by Remove(). 130 // Therefore first assembling separately, then removing. 131 children := make(map[*html.Node]struct{}) 132 for c := n.FirstChild; c != nil; c = c.NextSibling { 133 children[c] = struct{}{} 134 } 135 for k, _ := range children { 136 n.RemoveChild(k) 137 } 138 139 // we can't put our replacement "under" an image, since img cannot have children 140 if n.Type == html.ElementNode && n.Data == "img" { 141 // n.Parent.InsertBefore(textReplacement,n) 142 dom.InsertAfter(n, textReplacement) 143 dom.RemoveNode(n) 144 145 } else { 146 n.AppendChild(textReplacement) 147 } 148 149 // Insert a || and a newline before every <a...> 150 if n.Data == "a" { 151 prev := n 152 153 breaker0 := dom.Nd("text", "||") 154 n.Parent.InsertBefore(breaker0, prev) 155 156 breaker1 := dom.Nd("br") 157 n.Parent.InsertBefore(breaker1, prev) 158 159 breaker2 := dom.Nd("text", "\n") 160 n.Parent.InsertBefore(breaker2, prev) 161 } 162 163 } 164 for c := n.FirstChild; c != nil; c = c.NextSibling { 165 fRecurse(c) 166 } 167 } 168 169 // -------------------------- 170 // ---------------------- 171 var docRoot *html.Node 172 var err error 173 rdr := strings.NewReader(s) 174 docRoot, err = html.Parse(rdr) 175 if err != nil { 176 panic(fmt.Sprintf("3 %v \n", err)) 177 } 178 179 fRecurse(docRoot) 180 181 var b bytes.Buffer 182 err = html.Render(&b, docRoot) 183 if err != nil { 184 panic(fmt.Sprintf("4 %v \n", err)) 185 } 186 // log.Printf("len is %v\n", b.Len()) 187 188 return b.String() 189 } 190 191 func init() { 192 193 }