github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/t1_dedup_test.go (about)

     1  // +build dedup1
     2  // go test -tags=dedup1
     3  
     4  package dedup
     5  
     6  import (
     7  	"path"
     8  	"testing"
     9  
    10  	"appengine/aetest"
    11  
    12  	"github.com/pbberlin/tools/net/http/domclean2"
    13  	"github.com/pbberlin/tools/net/http/fetch"
    14  	"github.com/pbberlin/tools/net/http/fileserver"
    15  	"github.com/pbberlin/tools/net/http/loghttp"
    16  	"github.com/pbberlin/tools/net/http/repo"
    17  	"github.com/pbberlin/tools/stringspb"
    18  )
    19  
    20  func Test1(t *testing.T) {
    21  
    22  	lg, b := loghttp.BuffLoggerUniversal(nil, nil)
    23  	_ = b
    24  
    25  	c, err := aetest.NewContext(nil)
    26  	lg(err)
    27  	if err != nil {
    28  		return
    29  	}
    30  	defer c.Close()
    31  	fs := GetFS(c, 2)
    32  
    33  	remoteHostname := "www.welt.de"
    34  	remoteHostname = "www.welt.de/politik/ausland"
    35  
    36  	dirs1, _, msg, err := fileserver.GetDirContents(repo.RepoURL, remoteHostname)
    37  	if err != nil {
    38  		lg(err)
    39  		lg("%s", msg)
    40  	}
    41  
    42  	lg("dirs1")
    43  	for _, v := range dirs1 {
    44  		lg("    %v", v)
    45  	}
    46  
    47  	least3URLs := []string{}
    48  	for _, v1 := range dirs1 {
    49  
    50  		p := path.Join(remoteHostname, v1)
    51  		dirs2, fils2, msg, err := fileserver.GetDirContents(repo.RepoURL, p)
    52  		_ = dirs2
    53  		if err != nil {
    54  			lg(err)
    55  			lg("%s", msg)
    56  		}
    57  		// lg("  dirs2 %v", stringspb.IndentedDump(dirs2))
    58  		// lg("  fils2 %v", stringspb.IndentedDump(fils2))
    59  
    60  		for _, v2 := range fils2 {
    61  			least3URLs = append(least3URLs, path.Join(remoteHostname, v1, v2))
    62  		}
    63  	}
    64  
    65  	if len(least3URLs) < numTotal {
    66  		lg("not enough files in rss fetcher cache")
    67  		return
    68  	} else {
    69  		least3URLs = least3URLs[:numTotal+1]
    70  	}
    71  
    72  	lg("fils2")
    73  	for _, v := range least3URLs {
    74  		lg("    %v", v)
    75  	}
    76  
    77  	// domclean
    78  
    79  	least3Files := make([]repo.FullArticle, 0, len(least3URLs))
    80  	for i := 0; i < len(least3URLs); i++ {
    81  
    82  		surl := spf("%v/%v", repo.RepoURL, least3URLs[i])
    83  
    84  		fNamer := domclean2.FileNamer(logDir, i)
    85  		fNamer() // first call yields key
    86  
    87  		resBytes, inf, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
    88  		if err != nil {
    89  			lg(err)
    90  			return
    91  		}
    92  		lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(inf.URL.String(), 60))
    93  
    94  		fa := repo.FullArticle{}
    95  		fa.Url = inf.URL.String()
    96  		fa.Mod = inf.Mod
    97  		fa.Body = resBytes
    98  		least3Files = append(least3Files, fa)
    99  
   100  	}
   101  
   102  	doc := Dedup(least3Files, lg, fs)
   103  
   104  	fNamer := domclean2.FileNamer(logDir, 0)
   105  	fNamer() // first call yields key
   106  	fsPerm := GetFS(c, 2)
   107  	fileDump(lg, fsPerm, doc, fNamer, "_fin.html")
   108  
   109  	pf("MapSimiliarCompares: %v SimpleCompares: %v LevenstheinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
   110  	pf("correct finish\n")
   111  
   112  }