github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean1/1_cleanup.go (about)

     1  // Package domclean1 normalizes html dom trees in a primitive way.
     2  package domclean1
     3  
     4  import (
     5  	"bytes"
     6  	"fmt"
     7  	"net/http"
     8  	"net/url"
     9  	"strings"
    10  
    11  	"github.com/pbberlin/tools/net/http/dom"
    12  	"github.com/pbberlin/tools/net/http/fetch"
    13  	"golang.org/x/net/html"
    14  )
    15  
    16  var fCondenseNode func(*html.Node, int) string
    17  var fRecurse func(*html.Node)
    18  
    19  const emptySrc = "//:0"
    20  
    21  // r is the request to the proxy
    22  // u is the url, that the proxy has called
    23  func ModifyHTML(r *http.Request, u *url.URL, s string) string {
    24  
    25  	var nums int // counter
    26  
    27  	// needed to get the current request into the
    28  	// "static" recursive functions
    29  	var PackageProxyHost = r.Host // port included!
    30  	var PackageRemoteHost = fetch.HostFromUrl(u)
    31  
    32  	fCondenseNode = func(n *html.Node, depth int) (ret string) {
    33  
    34  		if n.Type == html.ElementNode && n.Data == "script" {
    35  			ret += fmt.Sprintf(" var script%v = '[script]'; ", nums)
    36  			nums++
    37  			return
    38  		}
    39  		if n.Type == html.ElementNode && n.Data == "style" {
    40  			ret += fmt.Sprintf(" .xxx {margin:2px;} ")
    41  			return
    42  		}
    43  
    44  		if n.Type == html.ElementNode && n.Data == "img" {
    45  			ret += fmt.Sprintf(" [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src"))
    46  		}
    47  
    48  		if n.Type == html.ElementNode && n.Data == "a" {
    49  			ret += "[a]"
    50  		}
    51  
    52  		if n.Type == html.TextNode {
    53  			s := n.Data
    54  			// s = replTabsNewline.Replace(s)
    55  			// s = strings.TrimSpace(s)
    56  			if len(s) < 4 {
    57  				ret += s
    58  			} else if s != "" {
    59  				if depth > 0 {
    60  					ret += fmt.Sprintf(" [txt%v] %v", depth, s)
    61  				} else {
    62  					ret += " [txt] " + s
    63  				}
    64  			}
    65  		}
    66  
    67  		for c := n.FirstChild; c != nil; c = c.NextSibling {
    68  			ret += fCondenseNode(c, depth+1)
    69  		}
    70  		return
    71  	}
    72  
    73  	// --------------------------
    74  	// ----------------------
    75  
    76  	fRecurse = func(n *html.Node) {
    77  
    78  		if n.Type == html.ElementNode && n.Data == "form" {
    79  			hidFld := new(html.Node)
    80  			hidFld.Type = html.ElementNode
    81  			hidFld.Data = "input"
    82  			hidFld.Attr = []html.Attribute{
    83  				html.Attribute{Key: "name", Val: "redirect-to"},
    84  				html.Attribute{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"), PackageRemoteHost)},
    85  			}
    86  			n.AppendChild(hidFld)
    87  
    88  			submt := new(html.Node)
    89  			submt.Type = html.ElementNode
    90  			submt.Data = "input"
    91  			submt.Attr = []html.Attribute{
    92  				html.Attribute{Key: "type", Val: "submit"},
    93  				html.Attribute{Key: "value", Val: "subm"},
    94  				html.Attribute{Key: "accesskey", Val: "f"},
    95  			}
    96  			n.AppendChild(submt)
    97  
    98  			n.Attr = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost)
    99  
   100  		}
   101  		if n.Type == html.ElementNode && n.Data == "script" {
   102  			for i := 0; i < len(n.Attr); i++ {
   103  				if n.Attr[i].Key == "src" {
   104  					n.Attr[i].Val = emptySrc
   105  				}
   106  			}
   107  		}
   108  		if n.Type == html.ElementNode &&
   109  			(n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") {
   110  
   111  			s := fCondenseNode(n, 0)
   112  			//fmt.Printf("found %v\n", s)
   113  			textReplacement := new(html.Node)
   114  			textReplacement.Type = html.TextNode
   115  			textReplacement.Data = s
   116  
   117  			attrStore := []html.Attribute{}
   118  			if n.Data == "a" || n.Data == "img" {
   119  				attrStore = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost)
   120  			}
   121  			if n.Data == "img" {
   122  				n.Data = "a"
   123  			}
   124  			if n.Data == "a" {
   125  				n.Attr = attrStore
   126  			}
   127  
   128  			// We want to remove all existing children.
   129  			// Direct loop impossible, since "NextSibling" is set to nil by Remove().
   130  			// Therefore first assembling separately, then removing.
   131  			children := make(map[*html.Node]struct{})
   132  			for c := n.FirstChild; c != nil; c = c.NextSibling {
   133  				children[c] = struct{}{}
   134  			}
   135  			for k, _ := range children {
   136  				n.RemoveChild(k)
   137  			}
   138  
   139  			// we can't put our replacement "under" an image, since img cannot have children
   140  			if n.Type == html.ElementNode && n.Data == "img" {
   141  				// n.Parent.InsertBefore(textReplacement,n)
   142  				dom.InsertAfter(n, textReplacement)
   143  				dom.RemoveNode(n)
   144  
   145  			} else {
   146  				n.AppendChild(textReplacement)
   147  			}
   148  
   149  			// Insert a  || and a newline before every <a...>
   150  			if n.Data == "a" {
   151  				prev := n
   152  
   153  				breaker0 := dom.Nd("text", "||")
   154  				n.Parent.InsertBefore(breaker0, prev)
   155  
   156  				breaker1 := dom.Nd("br")
   157  				n.Parent.InsertBefore(breaker1, prev)
   158  
   159  				breaker2 := dom.Nd("text", "\n")
   160  				n.Parent.InsertBefore(breaker2, prev)
   161  			}
   162  
   163  		}
   164  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   165  			fRecurse(c)
   166  		}
   167  	}
   168  
   169  	// --------------------------
   170  	// ----------------------
   171  	var docRoot *html.Node
   172  	var err error
   173  	rdr := strings.NewReader(s)
   174  	docRoot, err = html.Parse(rdr)
   175  	if err != nil {
   176  		panic(fmt.Sprintf("3 %v \n", err))
   177  	}
   178  
   179  	fRecurse(docRoot)
   180  
   181  	var b bytes.Buffer
   182  	err = html.Render(&b, docRoot)
   183  	if err != nil {
   184  		panic(fmt.Sprintf("4 %v \n", err))
   185  	}
   186  	// log.Printf("len is %v\n", b.Len())
   187  
   188  	return b.String()
   189  }
   190  
   191  func init() {
   192  
   193  }