github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/repo/8_fetch_similar.go

package repo

import (
	"bytes"
	"encoding/json"
	"io/ioutil"
	"net/http"
	"path"
	"strconv"
	"strings"
	"time"

	"github.com/pbberlin/tools/distrib"
	"github.com/pbberlin/tools/net/http/fetch"
	"github.com/pbberlin/tools/net/http/loghttp"
	"github.com/pbberlin/tools/net/http/routes"
	"github.com/pbberlin/tools/net/http/tplx"
	"github.com/pbberlin/tools/os/fsi"
	"github.com/pbberlin/tools/stringspb"
	"google.golang.org/appengine"
)

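// MyWorker carries a single URL to fetch, together with the request,
// logger and filesystem that fetchSave needs. It satisfies the
// distrib.Worker interface via its Work method, so a batch of MyWorkers
// can be fanned out concurrently through distrib.Distrib (see below).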
type MyWorker struct {
	SURL     string
	Protocol string

	r *http.Request

	lg loghttp.FuncBufUniv

	fs1 fsi.FileSystem

	err error
	FA  *FullArticle
}

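// Work fetches m.SURL via fetchSave. Short bodies that merely contain the
// no-redirects marker are discarded; everything else is wrapped into a
// FullArticle. Errors are stored in m.err for the caller to inspect.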
func (m *MyWorker) Work() {

	bts, mod, _, err := fetchSave(m)

	if err != nil {
		m.err = err
		return
	}

	if len(bts) < 200 {
		if bytes.Contains(bts, []byte(fetch.MsgNoRdirects)) {
			return
		}
	}

	m.FA = &FullArticle{}
	m.FA.Mod = mod
	m.FA.Body = bts
	m.FA.Url = m.SURL

}

// FetchSimilar is an extended version of Fetch.
// It uses a DirTree of crawled *links*, not actual files.
// As it moves up the directory tree, it crawls every document for additional links.
// It first moves up to find similar URLs at the same depth:
//                        /\
//          /\           /  \
//    /\   /  \         /    \
// It then moves up the ladder again - to accept higher URLs:
//                        /\
//          /\
//    /\
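//
// For example (hypothetical URL): starting from /news/europe/12345, it first
// collects sibling links below /news/europe, then same-depth links reached
// by climbing to /news and diving back down, before accepting links from
// higher levels. A request might look like this - "cnt", "prot" and
// "numworkers" are the real form keys, while the name of the URL parameter
// is whatever routes.URLParamKey holds:
//
//	GET /fetch-similar?<URLParamKey>=www.site.com/news/europe/12345&cnt=5&numworkers=8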
func FetchSimilar(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored; the closure reads the outer b

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	start := time.Now()

	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Find similar HTML URLs"}))
	defer wpf(b, tplx.Foot)

	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	fs1 := GetFS(appengine.NewContext(r))

	err := r.ParseForm()
	lg(err)

	countSimilar := 3
	sCountSimilar := r.FormValue("cnt")
	if sCountSimilar != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sCountSimilar))
		if err == nil {
			countSimilar = i
		}
	}

	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := r.FormValue("prot")

	numWorkers := 0
	sNumWorkers := r.FormValue("numworkers")
	if sNumWorkers != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sNumWorkers))
		if err == nil {
			numWorkers = i
		}
	}

	srcDepth := strings.Count(ourl.Path, "/")

	cmd := FetchCommand{}
	cmd.Host = ourl.Host
	cmd.SearchPrefix = ourl.Path
	cmd = addDefaults(cmd)

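	// Load this host's digest - a DirTree of previously crawled links,
	// not the documents themselves.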
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}
	fnDigest := path.Join(docRoot, cmd.Host, "digest2.json")
	loadDigest(w, r, lg, fs1, fnDigest, dirTree) // load previously saved digest
	lg("dirtree, first 400 chars: %v", stringspb.ToLen(dirTree.String(), 400))

	m1 := new(MyWorker)
	m1.r = r
	m1.lg = lg
	m1.fs1 = fs1
	m1.SURL = path.Join(cmd.Host, ourl.Path)
	m1.Protocol = knownProtocol
	btsSrc, modSrc, usedExisting, err := fetchSave(m1)
	if !usedExisting {
		addAnchors(lg, cmd.Host, btsSrc, dirTree)
	}
	lg(err)
	if err != nil {
		return
	}

	lg("\t\t%4.2v secs so far 1", time.Now().Sub(start).Seconds())

	// superseded example values:
	//   treePath = "/blogs/freeexchange"
	//   treePath = "/news/europe"
	treePath := path.Dir(ourl.Path)

	opt := LevelWiseDeeperOptions{}
	opt.Rump = treePath
	// superseded example exclusions:
	//   opt.ExcludeDir = "/news/americas"
	//   opt.ExcludeDir = "/blogs/buttonwood"
	opt.ExcludeDir = "/something-impossible"
	opt.MinDepthDiff = 1
	opt.MaxDepthDiff = 1
	opt.CondenseTrailingDirs = cmd.CondenseTrailingDirs
	// cmd.DesiredNumber + 1 would cover "self"; we collect 40 more,
	// because links that are too old get filtered out later.
	opt.MaxNumber = cmd.DesiredNumber + 40

	var subtree *DirTree
	links := []FullArticle{}

	alreadyCrawled := map[string]struct{}{}

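	// The outer loop j successively accepts links from higher levels;
	// the inner loop i climbs the directory tree, searching each subtree
	// for links whose depth differs from the current level by exactly i-j.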
MarkOuter:
	for j := 0; j < srcDepth; j++ {
		treePath = path.Dir(ourl.Path)
	MarkInner:
		// for i := 1; i < srcDepth; i++ {
		for i := 1; i < (srcDepth + 5); i++ {

			subtree, treePath = DiveToDeepestMatch(dirTree, treePath)

			lg("Looking from height %v to level %v  - %v", srcDepth-i, srcDepth-j, treePath)

			if _, ok := alreadyCrawled[treePath]; ok {
				// lg("\t already digested %v", treePath)
				continue
			}

			m2 := new(MyWorker)
			m2.r = r
			m2.lg = lg
			m2.fs1 = fs1
			m2.SURL = path.Join(cmd.Host, treePath)
			m2.Protocol = knownProtocol

			btsPar, _, usedExisting, err := fetchSave(m2)
			lg(err)
			if err != nil {
				return
			}
			alreadyCrawled[treePath] = struct{}{}
			if !usedExisting {
				addAnchors(lg, cmd.Host, btsPar, dirTree)
			}

			if subtree == nil {
				lg("\n#%v treePath %q ; subtree is nil", i, treePath)
			} else {
				// lg("\n#%v treePath %q ; subtree exists", i, treePath)

				opt.Rump = treePath
				opt.MinDepthDiff = i - j
				opt.MaxDepthDiff = i - j
				lvlLinks := LevelWiseDeeper(nil, nil, subtree, opt)
				links = append(links, lvlLinks...)
				for _, art := range lvlLinks {
					_ = art
					// lg("#%v fnd    %v", i, stringspb.ToLen(art.Url, 100))
				}

				if len(links) >= opt.MaxNumber {
					lg("found enough links")
					break MarkOuter
				}

				pathPrev := treePath
				treePath = path.Dir(treePath)
				// lg("#%v  bef %v - aft %v", i, pathPrev, treePath)

				if pathPrev == "." && treePath == "." ||
					pathPrev == "/" && treePath == "/" ||
					pathPrev == "" && treePath == "." {
					lg("break to inner")
					break MarkInner
				}
			}

		}
	}

	lg("%v links after %4.2v secs", len(links), time.Now().Sub(start).Seconds())

	lg("============================")
	lg("Now reading/fetching actual similar files - not just the links")

	tried := 0
	selecteds := []FullArticle{}

	nonExisting := []FullArticle{}
	nonExistFetched := []FullArticle{}

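	// First pass: reuse similars already cached in the filesystem,
	// provided they are fresher than 10 hours.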
	for _, art := range links {

		if art.Url == ourl.Path {
			lg("skipping self\t%v", art.Url)
			continue
		}

		tried++

		useExisting := false

		semanticUri := condenseTrailingDir(art.Url, cmd.CondenseTrailingDirs)
		p := path.Join(docRoot, cmd.Host, semanticUri)

		f, err := fs1.Open(p)
		// lg(err) // it is not an error if the file does not exist
		if err != nil {
			// lg("!nstore %q", semanticUri)
		} else {
			// lg("reading %q", semanticUri)

			// Wrap the read in a func so that f.Close() runs at the end of
			// each iteration; a plain defer f.Close() would span the entire
			// outer func and prevent overwrites and chmods further down.
			readCached := func() {
				defer f.Close()
				fi, err := f.Stat()
				lg(err)
				if err == nil {
					age := time.Now().Sub(fi.ModTime())
					if age.Hours() < 10 {
						lg("\t\tusing existing file with age %4.2v hrs", age.Hours())
						art.Mod = fi.ModTime()
						bts, err := ioutil.ReadAll(f)
						lg(err)
						art.Body = bts
						if len(bts) < 200 {
							if bytes.Contains(bts, []byte(fetch.MsgNoRdirects)) {
								return
							}
						}
						selecteds = append(selecteds, art)
						useExisting = true
					}
				}
			}
			readCached()

		}

		if !useExisting {
			nonExisting = append(nonExisting, art)
		}

		if len(selecteds) >= countSimilar {
			break
		}

	}
	lg("============================")
	lg("tried %v links - yielding %v existing similars; not existing in datastore: %v, %v were requested.",
		tried, len(selecteds), len(nonExisting), countSimilar)

	if len(selecteds) < countSimilar {
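		// Second pass: not enough cached similars - fetch the missing ones
		// concurrently (the 5s query limit forces haste).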
		jobs := make([]distrib.Worker, 0, len(nonExisting))
		for _, art := range nonExisting {
			surl := path.Join(cmd.Host, art.Url)
			wrkr := MyWorker{SURL: surl}
			wrkr.Protocol = knownProtocol
			wrkr.r = r
			wrkr.lg = lg
			wrkr.fs1 = fs1
			job := distrib.Worker(&wrkr)
			jobs = append(jobs, job)
		}

		opt := distrib.NewDefaultOptions()
		opt.TimeOutDur = 3500 * time.Millisecond
		opt.Want = int32(countSimilar - len(selecteds) + 4) // get some more, in case we have "redirected" bodies
		opt.NumWorkers = int(opt.Want)                      // 5s query limit => hurry; spawn as many as we want
		if numWorkers > 0 {
			opt.NumWorkers = numWorkers
		}
		lg("Preparing %v simultaneous, wanting %v fetches; at %4.2v secs.", opt.NumWorkers, opt.Want, time.Now().Sub(start).Seconds())
		opt.CollectRemainder = false // 5s query limit => hurry; don't wait for stragglers

		ret, msg := distrib.Distrib(jobs, opt)
		lg("Distrib returned at %4.2v secs with %v results.", time.Now().Sub(start).Seconds(), len(ret))

		lg("\n" + msg.String())
		for _, v := range ret {
			v1, _ := v.Worker.(*MyWorker)
			if v1.FA != nil {
				age := time.Now().Sub(v1.FA.Mod)
				if age.Hours() < 10 {
					lg("\t\tusing fetched file with age %4.2v hrs", age.Hours())
					nonExistFetched = append(nonExistFetched, *v1.FA)
					if len(nonExistFetched) > (countSimilar - len(selecteds)) {
						break
					}
				}
			}
			if v1.err != nil {
				lg(v1.err)
			}
		}

		lg("tried %v links - yielding %v fetched - jobs %v", len(nonExisting), len(nonExistFetched), len(jobs))
		selecteds = append(selecteds, nonExistFetched...)

		// extract links from the fetched documents into the in-memory dirtree
		for _, v := range nonExistFetched {
			// lg("links -> memory dirtree for %q", v.Url)
			addAnchors(lg, cmd.Host, v.Body, dirTree)
		}

	}

	if time.Now().Sub(dirTree.LastFound).Seconds() < 10 {
		lg("saving accumulated (new) links to digest")
		saveDigest(lg, fs1, fnDigest, dirTree)
	}

	lg("\t\t%4.2v secs so far 3", time.Now().Sub(start).Seconds())

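	// Assemble the JSON response: the source document itself ("_self" keys)
	// plus each selected similar document as url/mod/bod triples keyed with
	// a two-digit index.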
	mp := map[string][]byte{}
	mp["msg"] = b.Bytes()
	mp["url_self"] = []byte(condenseTrailingDir(ourl.Path, cmd.CondenseTrailingDirs))
	mp["mod_self"] = []byte(modSrc.Format(http.TimeFormat))
	mp["bod_self"] = btsSrc

	for i, v := range selecteds {
		mp["url__"+spf("%02v", i)] = []byte(v.Url)
		mp["mod__"+spf("%02v", i)] = []byte(v.Mod.Format(http.TimeFormat))
		mp["bod__"+spf("%02v", i)] = v.Body
	}

	mp["lensimilar"] = []byte(spf("%02v", len(selecteds)))

	smp, err := json.MarshalIndent(mp, "", "\t")
	if err != nil {
		lg("marshalling mp to []byte failed: %v", err)
		return
	}

	r.Header.Set("X-Custom-Header-Counter", "nocounter")
	w.Header().Set("Content-Type", "application/json")
	w.Write(smp)

	b.Reset()             // this keeps the buf pointer intact; outgoing defers are still heeded
	b = new(bytes.Buffer) // creates a *new* buf pointer; outgoing defers write into the *old* buf

	lg("\t\t%4.2v secs so far 4 (json resp written as []byte)", time.Now().Sub(start).Seconds())
}