github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean2/00_pipeline.go (about)

     1  package domclean2
     2  
     3  import (
     4  	"bytes"
     5  	"net/url"
     6  	"path/filepath"
     7  
     8  	"github.com/pbberlin/tools/net/http/loghttp"
     9  	"github.com/pbberlin/tools/net/http/routes"
    10  	"github.com/pbberlin/tools/os/osutilpb"
    11  	"golang.org/x/net/html"
    12  )
    13  
    14  type CleaningOptions struct {
    15  	FNamer func() string
    16  
    17  	Proxify    bool
    18  	ProxyHost  string
    19  	RemoteHost string
    20  
    21  	AddOutline bool
    22  	AddID      bool
    23  
    24  	Beautify bool // make pretty at the end, removes <a> linktext trailing space
    25  }
    26  
    27  func FileNamer(logdir string, fileNumber int) func() string {
    28  	cntr := -2
    29  	return func() string {
    30  		cntr++
    31  		if cntr == -1 {
    32  			return spf("outp_%03v", fileNumber) // prefix/filekey
    33  		} else {
    34  			fn := spf("outp_%03v_%v", fileNumber, cntr) // filename with stage
    35  			fn = filepath.Join(logdir, fn)
    36  			return fn
    37  		}
    38  	}
    39  }
    40  
    41  func globFixes(b []byte) []byte {
    42  	// <!--(.*?)-->
    43  
    44  	b = bytes.Replace(b, []byte("<!--<![endif]-->"), []byte("<![endif]-->"), -1)
    45  	return b
    46  }
    47  
    48  func fileDump(doc *html.Node, fNamer func() string) {
    49  	if fNamer != nil {
    50  		removeCommentsAndIntertagWhitespace(NdX{doc, 0})
    51  		reIndent(doc, 0)
    52  		osutilpb.Dom2File(fNamer()+".html", doc)
    53  		removeCommentsAndIntertagWhitespace(NdX{doc, 0})
    54  	}
    55  }
    56  
    57  func DomClean(b []byte, opt CleaningOptions) (*html.Node, error) {
    58  
    59  	lg, lge := loghttp.Logger(nil, nil)
    60  	_ = lg
    61  
    62  	b = globFixes(b)
    63  	doc, err := html.Parse(bytes.NewReader(b))
    64  	if err != nil {
    65  		lge(err)
    66  		return nil, err
    67  	}
    68  
    69  	if opt.FNamer != nil {
    70  		osutilpb.Dom2File(opt.FNamer()+".html", doc)
    71  	}
    72  
    73  	//
    74  	//
    75  	cleanseDom(doc, 0)
    76  	removeCommentsAndIntertagWhitespace(NdX{doc, 0})
    77  	fileDump(doc, opt.FNamer)
    78  
    79  	//
    80  	//
    81  	condenseTopDown(doc, 0, 0)
    82  	removeEmptyNodes(doc, 0)
    83  	fileDump(doc, opt.FNamer)
    84  
    85  	//
    86  	//
    87  	removeCommentsAndIntertagWhitespace(NdX{doc, 0}) // prevent spacey textnodes around singl child images
    88  	breakoutImagesFromAnchorTrees(doc)
    89  	recurseImg2Link(doc)
    90  	fileDump(doc, opt.FNamer)
    91  
    92  	//
    93  	//
    94  	condenseBottomUpV3(doc, 0, 7, map[string]bool{"div": true})
    95  	condenseBottomUpV3(doc, 0, 6, map[string]bool{"div": true})
    96  	condenseBottomUpV3(doc, 0, 5, map[string]bool{"div": true})
    97  	condenseBottomUpV3(doc, 0, 4, map[string]bool{"div": true})
    98  	condenseTopDown(doc, 0, 0)
    99  
   100  	removeEmptyNodes(doc, 0)
   101  	removeEmptyNodes(doc, 0)
   102  
   103  	fileDump(doc, opt.FNamer)
   104  
   105  	//
   106  	//
   107  	if opt.Proxify {
   108  		if opt.ProxyHost == "" {
   109  			opt.ProxyHost = routes.AppHost()
   110  		}
   111  
   112  		proxify(doc, opt.ProxyHost, &url.URL{Scheme: "http", Host: opt.RemoteHost})
   113  		fileDump(doc, opt.FNamer)
   114  	}
   115  
   116  	if opt.Beautify {
   117  		removeCommentsAndIntertagWhitespace(NdX{doc, 0})
   118  		reIndent(doc, 0)
   119  
   120  	}
   121  
   122  	//
   123  	//
   124  	if opt.AddOutline {
   125  		addOutlineAttr(doc, 0, []int{0})
   126  	}
   127  	if opt.AddID {
   128  		addIdAttr(doc, 0, 1)
   129  	}
   130  	if opt.AddOutline || opt.AddID {
   131  		fileDump(doc, opt.FNamer)
   132  	}
   133  
   134  	//
   135  	computeXPathStack(doc, 0)
   136  	if opt.FNamer != nil {
   137  		osutilpb.Bytes2File(opt.FNamer()+".txt", xPathDump)
   138  	}
   139  
   140  	return doc, nil
   141  
   142  }
   143  
   144  func DomFormat(doc *html.Node) {
   145  	removeEmptyNodes(doc, 0)
   146  	removeCommentsAndIntertagWhitespace(NdX{doc, 0})
   147  	reIndent(doc, 0)
   148  }