github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean2/00_pipeline.go (about) 1 package domclean2 2 3 import ( 4 "bytes" 5 "net/url" 6 "path/filepath" 7 8 "github.com/pbberlin/tools/net/http/loghttp" 9 "github.com/pbberlin/tools/net/http/routes" 10 "github.com/pbberlin/tools/os/osutilpb" 11 "golang.org/x/net/html" 12 ) 13 14 type CleaningOptions struct { 15 FNamer func() string 16 17 Proxify bool 18 ProxyHost string 19 RemoteHost string 20 21 AddOutline bool 22 AddID bool 23 24 Beautify bool // make pretty at the end, removes <a> linktext trailing space 25 } 26 27 func FileNamer(logdir string, fileNumber int) func() string { 28 cntr := -2 29 return func() string { 30 cntr++ 31 if cntr == -1 { 32 return spf("outp_%03v", fileNumber) // prefix/filekey 33 } else { 34 fn := spf("outp_%03v_%v", fileNumber, cntr) // filename with stage 35 fn = filepath.Join(logdir, fn) 36 return fn 37 } 38 } 39 } 40 41 func globFixes(b []byte) []byte { 42 // <!--(.*?)--> 43 44 b = bytes.Replace(b, []byte("<!--<![endif]-->"), []byte("<![endif]-->"), -1) 45 return b 46 } 47 48 func fileDump(doc *html.Node, fNamer func() string) { 49 if fNamer != nil { 50 removeCommentsAndIntertagWhitespace(NdX{doc, 0}) 51 reIndent(doc, 0) 52 osutilpb.Dom2File(fNamer()+".html", doc) 53 removeCommentsAndIntertagWhitespace(NdX{doc, 0}) 54 } 55 } 56 57 func DomClean(b []byte, opt CleaningOptions) (*html.Node, error) { 58 59 lg, lge := loghttp.Logger(nil, nil) 60 _ = lg 61 62 b = globFixes(b) 63 doc, err := html.Parse(bytes.NewReader(b)) 64 if err != nil { 65 lge(err) 66 return nil, err 67 } 68 69 if opt.FNamer != nil { 70 osutilpb.Dom2File(opt.FNamer()+".html", doc) 71 } 72 73 // 74 // 75 cleanseDom(doc, 0) 76 removeCommentsAndIntertagWhitespace(NdX{doc, 0}) 77 fileDump(doc, opt.FNamer) 78 79 // 80 // 81 condenseTopDown(doc, 0, 0) 82 removeEmptyNodes(doc, 0) 83 fileDump(doc, opt.FNamer) 84 85 // 86 // 87 removeCommentsAndIntertagWhitespace(NdX{doc, 0}) // prevent spacey textnodes around singl child images 88 breakoutImagesFromAnchorTrees(doc) 89 recurseImg2Link(doc) 90 fileDump(doc, opt.FNamer) 91 92 // 93 // 94 condenseBottomUpV3(doc, 0, 7, map[string]bool{"div": true}) 95 condenseBottomUpV3(doc, 0, 6, map[string]bool{"div": true}) 96 condenseBottomUpV3(doc, 0, 5, map[string]bool{"div": true}) 97 condenseBottomUpV3(doc, 0, 4, map[string]bool{"div": true}) 98 condenseTopDown(doc, 0, 0) 99 100 removeEmptyNodes(doc, 0) 101 removeEmptyNodes(doc, 0) 102 103 fileDump(doc, opt.FNamer) 104 105 // 106 // 107 if opt.Proxify { 108 if opt.ProxyHost == "" { 109 opt.ProxyHost = routes.AppHost() 110 } 111 112 proxify(doc, opt.ProxyHost, &url.URL{Scheme: "http", Host: opt.RemoteHost}) 113 fileDump(doc, opt.FNamer) 114 } 115 116 if opt.Beautify { 117 removeCommentsAndIntertagWhitespace(NdX{doc, 0}) 118 reIndent(doc, 0) 119 120 } 121 122 // 123 // 124 if opt.AddOutline { 125 addOutlineAttr(doc, 0, []int{0}) 126 } 127 if opt.AddID { 128 addIdAttr(doc, 0, 1) 129 } 130 if opt.AddOutline || opt.AddID { 131 fileDump(doc, opt.FNamer) 132 } 133 134 // 135 computeXPathStack(doc, 0) 136 if opt.FNamer != nil { 137 osutilpb.Bytes2File(opt.FNamer()+".txt", xPathDump) 138 } 139 140 return doc, nil 141 142 } 143 144 func DomFormat(doc *html.Node) { 145 removeEmptyNodes(doc, 0) 146 removeCommentsAndIntertagWhitespace(NdX{doc, 0}) 147 reIndent(doc, 0) 148 }