github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/domclean2/t_parsing_test.go (about) 1 // +build parsing 2 // go test -tags=parsing 3 4 package domclean2 5 6 import ( 7 "fmt" 8 "log" 9 "net/http" 10 "os" 11 "path" 12 "testing" 13 "time" 14 15 "appengine/aetest" 16 17 "github.com/pbberlin/tools/net/http/fetch" 18 "github.com/pbberlin/tools/net/http/fileserver" 19 "github.com/pbberlin/tools/net/http/loghttp" 20 "github.com/pbberlin/tools/net/http/repo" 21 "github.com/pbberlin/tools/net/http/routes" 22 "github.com/pbberlin/tools/sort/sortmap" 23 "github.com/pbberlin/tools/stringspb" 24 ) 25 26 const numTotal = 3 // comparable html docs 27 const stageMax = 3 // weedstages 28 29 const cTestHostOwn = "localhost:63222" 30 31 var hostWithPref = routes.AppHost() + repo.UriMountNameY 32 33 func prepare(t *testing.T) aetest.Context { 34 35 lg, lge := loghttp.Logger(nil, nil) 36 _ = lg 37 38 c, err := aetest.NewContext(nil) 39 if err != nil { 40 lge(err) 41 t.Fatal(err) 42 } 43 44 serveFile := func(w http.ResponseWriter, r *http.Request, m map[string]interface{}) { 45 fs1 := repo.GetFS(c) 46 fileserver.FsiFileServer(w, r, fileserver.Options{FS: fs1, Prefix: repo.UriMountNameY}) 47 } 48 http.HandleFunc(repo.UriMountNameY, loghttp.Adapter(serveFile)) 49 50 go func() { 51 log.Fatal( 52 http.ListenAndServe(cTestHostOwn, nil), 53 ) 54 }() 55 56 return c 57 58 } 59 60 func Test1(t *testing.T) { 61 62 lg, lge := loghttp.Logger(nil, nil) 63 64 // c := prepare(t) 65 // defer c.Close() 66 67 lg("waiting for webserver") 68 time.Sleep(2 * time.Millisecond) 69 70 remoteHostname := "www.welt.de" 71 72 dirs1, _, msg, err := fileserver.GetDirContents(hostWithPref, remoteHostname) 73 if err != nil { 74 lge(err) 75 lg("%s", msg) 76 } 77 78 lg("dirs1") 79 for _, v := range dirs1 { 80 lg(" %v", v) 81 } 82 83 least3Files := []string{} 84 for _, v1 := range dirs1 { 85 86 dirs2, fils2, msg, err := fileserver.GetDirContents(hostWithPref, path.Join(remoteHostname, v1)) 87 _ = dirs2 88 if err != nil { 89 lge(err) 90 lg("%s", msg) 91 } 92 // lg(" dirs2 %v", stringspb.IndentedDump(dirs2)) 93 // lg(" fils2 %v", stringspb.IndentedDump(fils2)) 94 95 if len(fils2) > numTotal-1 { 96 for i2, v2 := range fils2 { 97 least3Files = append(least3Files, path.Join(remoteHostname, v1, v2)) 98 if i2 == numTotal-1 { 99 break 100 } 101 } 102 break 103 } 104 } 105 106 if len(least3Files) < numTotal { 107 lg("not enough files in rss fetcher cache") 108 return 109 } 110 111 lg("fils2") 112 for _, v := range least3Files { 113 lg(" %v", v) 114 } 115 116 logdir := prepareLogDir() 117 118 iter := make([]int, numTotal) 119 120 for i, _ := range iter { 121 122 surl := spf("%v/%v", hostWithPref, least3Files[i]) 123 124 fNamer := FileNamer(logdir, i) 125 fnKey := fNamer() // first call yields key 126 _ = fnKey 127 128 resBytes, effUrl, err := fetch.UrlGetter(nil, fetch.Options{URL: surl}) 129 if err != nil { 130 lge(err) 131 return 132 } 133 lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(effUrl.String(), 60)) 134 opts := CleaningOptions{Proxify: true} 135 opts.FNamer = fNamer 136 opts.RemoteHost = remoteHostname 137 doc, err := DomClean(resBytes, opts) 138 lge(err) 139 _ = doc 140 141 } 142 143 // statistics on elements and attributes 144 sorted1 := sortmap.SortMapByCount(attrDistinct) 145 sorted1.Print(6) 146 fmt.Println() 147 sorted2 := sortmap.SortMapByCount(nodeDistinct) 148 sorted2.Print(6) 149 150 pf("correct finish\n") 151 152 } 153 154 func prepareLogDir() string { 155 156 lg, lge := loghttp.Logger(nil, nil) 157 158 logdir := "outp" 159 lg("logdir is %v ", logdir) 160 161 // sweep previous 162 rmPath := spf("./%v/", logdir) 163 err := os.RemoveAll(rmPath) 164 if err != nil { 165 lge(err) 166 os.Exit(1) 167 } 168 lg("removed %q", rmPath) 169 170 // create anew 171 err = os.Mkdir(logdir, 0755) 172 if err != nil && !os.IsExist(err) { 173 lge(err) 174 os.Exit(1) 175 } 176 177 return logdir 178 179 }