github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/t1_dedup_test.go (about) 1 // +build dedup1 2 // go test -tags=dedup1 3 4 package dedup 5 6 import ( 7 "path" 8 "testing" 9 10 "appengine/aetest" 11 12 "github.com/pbberlin/tools/net/http/domclean2" 13 "github.com/pbberlin/tools/net/http/fetch" 14 "github.com/pbberlin/tools/net/http/fileserver" 15 "github.com/pbberlin/tools/net/http/loghttp" 16 "github.com/pbberlin/tools/net/http/repo" 17 "github.com/pbberlin/tools/stringspb" 18 ) 19 20 func Test1(t *testing.T) { 21 22 lg, b := loghttp.BuffLoggerUniversal(nil, nil) 23 _ = b 24 25 c, err := aetest.NewContext(nil) 26 lg(err) 27 if err != nil { 28 return 29 } 30 defer c.Close() 31 fs := GetFS(c, 2) 32 33 remoteHostname := "www.welt.de" 34 remoteHostname = "www.welt.de/politik/ausland" 35 36 dirs1, _, msg, err := fileserver.GetDirContents(repo.RepoURL, remoteHostname) 37 if err != nil { 38 lg(err) 39 lg("%s", msg) 40 } 41 42 lg("dirs1") 43 for _, v := range dirs1 { 44 lg(" %v", v) 45 } 46 47 least3URLs := []string{} 48 for _, v1 := range dirs1 { 49 50 p := path.Join(remoteHostname, v1) 51 dirs2, fils2, msg, err := fileserver.GetDirContents(repo.RepoURL, p) 52 _ = dirs2 53 if err != nil { 54 lg(err) 55 lg("%s", msg) 56 } 57 // lg(" dirs2 %v", stringspb.IndentedDump(dirs2)) 58 // lg(" fils2 %v", stringspb.IndentedDump(fils2)) 59 60 for _, v2 := range fils2 { 61 least3URLs = append(least3URLs, path.Join(remoteHostname, v1, v2)) 62 } 63 } 64 65 if len(least3URLs) < numTotal { 66 lg("not enough files in rss fetcher cache") 67 return 68 } else { 69 least3URLs = least3URLs[:numTotal+1] 70 } 71 72 lg("fils2") 73 for _, v := range least3URLs { 74 lg(" %v", v) 75 } 76 77 // domclean 78 79 least3Files := make([]repo.FullArticle, 0, len(least3URLs)) 80 for i := 0; i < len(least3URLs); i++ { 81 82 surl := spf("%v/%v", repo.RepoURL, least3URLs[i]) 83 84 fNamer := domclean2.FileNamer(logDir, i) 85 fNamer() // first call yields key 86 87 resBytes, inf, err := fetch.UrlGetter(nil, fetch.Options{URL: surl}) 88 if err != nil { 89 lg(err) 90 return 91 } 92 lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(inf.URL.String(), 60)) 93 94 fa := repo.FullArticle{} 95 fa.Url = inf.URL.String() 96 fa.Mod = inf.Mod 97 fa.Body = resBytes 98 least3Files = append(least3Files, fa) 99 100 } 101 102 doc := Dedup(least3Files, lg, fs) 103 104 fNamer := domclean2.FileNamer(logDir, 0) 105 fNamer() // first call yields key 106 fsPerm := GetFS(c, 2) 107 fileDump(lg, fsPerm, doc, fNamer, "_fin.html") 108 109 pf("MapSimiliarCompares: %v SimpleCompares: %v LevenstheinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare) 110 pf("correct finish\n") 111 112 }