github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/repo/4_fetch.go

package repo

import (
	"bytes"
	"encoding/json"
	"net/http"
	"net/url"
	"path"
	"sync"
	"time"

	"github.com/pbberlin/tools/net/http/fetch"
	"github.com/pbberlin/tools/net/http/loghttp"
	"github.com/pbberlin/tools/os/fsi"
	"github.com/pbberlin/tools/sort/sortmap"
	"github.com/pbberlin/tools/stringspb"
)

// FetchUsingRSS takes an RSS XML URI and fetches some of its documents.
// It uses a three-stage pipeline for parallel fetching.
// Results are stored in the given filesystem fs.
// config points to the source of the RSS XML
// and holds some rules for conflating URI directories.
// config.SearchPrefix and config.DesiredNumber tell the func
// which subdirs of the RSS dir should be fetched - and how many at most.
func FetchUsingRSS(w http.ResponseWriter, r *http.Request,
	fs fsi.FileSystem, config FetchCommand,
) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored; the closure uses the captured b

	if config.Host == "" {
		lg(" empty host; returning")
		return
	}

	config = addDefaults(config)

	// Fetching the RSS XML takes time.
	// We do it before the timeouts of the pipeline stages are set off.
	lg(" ")
	lg(config.Host)
	if config.Host == "test.economist.com" {
		switchTData(w, r)
	}

	// lg(stringspb.IndentedDump(config))
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

	fnDigest := path.Join(docRoot, config.Host, "digest2.json")
	loadDigest(w, r, lg, fs, fnDigest, dirTree) // digest of a previous run, if any

	age := time.Since(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))
	if age.Hours() > 0.001 {

		rssUrl := matchingRSSURI(w, r, config)
		if rssUrl == "" {
			m := new(MyWorker)
			m.r = r
			m.lg = lg
			m.fs1 = fs
			m.SURL = path.Join(config.Host, config.SearchPrefix)
			_, _, _, err := fetchSave(m)
			lg(err)
			if err != nil {
				return
			}
		} else {
			rssUrl = path.Join(config.Host, rssUrl)
			rssDoc, rssUrlObj := rssXMLFile(w, r, fs, rssUrl)
			_ = rssUrlObj
			rssDoc2DirTree(w, r, dirTree, rssDoc, config.Host)
		}

		saveDigest(lg, fs, fnDigest, dirTree)
	}

	// lg(dirTree.String())
	//
	//
	// Set up a three-stage pipeline from the bottom up.
	//
	var fullArticles []FullArticle

	var inn chan *FullArticle = make(chan *FullArticle) // jobs are stuffed in here
	var out chan *FullArticle = make(chan *FullArticle) // completed jobs are delivered here
	var fin chan struct{} = make(chan struct{})         // downstream signals end to upstream
	var stage3Wait sync.WaitGroup
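
	// Data flow: stage 1 (stuffStage1 below) pushes article stubs into inn;
	// the stage-2 workers fetch each article body and pass the result to out;
	// the stage-3 collector gathers the results from out into fullArticles.
	// When the collector times out, it closes fin, which tells stages 1 and 2
	// to stop; stage3Wait lets this function wait for the collector to finish.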

	// stage 3
	// fire up the "collector", a fan-in
	stage3Wait.Add(1) // Add before starting the goroutine, so Wait() below cannot pass before the collector is registered
	go func() {
		// 400 ms is a good value; critical point at 35 ms
		// economist.com required 800 ms
		const delayInitial = 1200
		const delayRefresh = 800
		cout := time.After(time.Millisecond * delayInitial)
		for {
			select {

			case fa := <-out:
				fullArticles = append(fullArticles, *fa)
				pth := fetch.PathFromStringUrl(fa.Url)
				lg("    fetched   %v - %v ", fa.Mod.Format("15:04:05"), stringspb.Ellipsoider(pth, 50))
				cout = time.After(time.Millisecond * delayRefresh) // refresh timeout
			case <-cout:
				lg("timeout after %v articles", len(fullArticles))
				// We use the channel == nil / closed-channel combinations
				// inspired by http://dave.cheney.net/2013/04/30/curious-channels
				out = nil // not close(out) => the receive case above is now blocked
				close(fin)
				lg("fin closed; out nilled")
				stage3Wait.Done()
				return
			}
		}
	}()

	//
	// stage 2
	for i := 0; i < numWorkers; i++ {
		// fire up a dedicated fetcher routine, a worker
		// We use the channel == nil / closed-channel combinations
		// inspired by http://dave.cheney.net/2013/04/30/curious-channels
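		// Each worker blocks on inn. After fetching an article it tries to
		// hand the result to out; if fin closes first, it abandons the article
		// (branch 1). If fin closes while the worker is idle, it exits
		// directly (branch 2).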
		go func() {
			var a *FullArticle
			for {
				select {
				case a = <-inn:
					var err error
					var inf fetch.Info
					a.Body, inf, err = fetch.UrlGetter(r, fetch.Options{URL: a.Url})
					lg(err)
					if a.Mod.IsZero() {
						a.Mod = inf.Mod
					}
					select {
					case out <- a:
					case <-fin:
						lg("    worker spinning down; branch 1; abandoning %v", a.Url)
						return
					}
					a = new(FullArticle) // reset, so branch 2 below does not report the delivered article as abandoned
				case <-fin:
					if a != nil && a.Url != "" {
						u, _ := url.Parse(a.Url)
						lg("    abandoned %v", u.Path)
					} else {
						lg("    worker spinning down; branch 2")
					}
					return
				}
			}
		}()
	}

	//
	//
	//
	// loading stage 1
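	// If the search prefix does not yield enough articles, widen the search:
	// walk up one directory level per iteration, excluding the subtree that
	// was already searched, until DesiredNumber is reached or the root
	// is exhausted.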
	uriPrefix := config.SearchPrefix
	found := 0
	uriPrefixExcl := "impossible"
	for i := 0; i < 15; i++ {
		lg("  searching for prefix   %v    - excl %q    - %v of %v", uriPrefix, uriPrefixExcl, found, config.DesiredNumber)
		found += stuffStage1(w, r, config, inn, fin, dirTree,
			uriPrefixExcl, uriPrefix, config.DesiredNumber-found)

		if found >= config.DesiredNumber {
			break
		}

		if uriPrefix == "/" || uriPrefix == "." {
			lg("  root exhausted")
			break
		}

		newPrefix := path.Dir(uriPrefix)
		uriPrefixExcl = uriPrefix
		uriPrefix = newPrefix
	}
	lg("  found %v of %v", found, config.DesiredNumber)

	//
	lg("stage3Wait.Wait() before")
	stage3Wait.Wait()
	lg("stage3Wait.Wait() after")

	// The workers spin down earlier -
	// but the ae log writer and the response writer need some time
	// to record the spin-down messages.
	time.Sleep(120 * time.Millisecond)

	// compile directory statistics
	histoDir := map[string]int{}
	for _, a := range fullArticles {
		u, err := url.Parse(a.Url)
		lg(err)
		if err != nil {
			continue
		}
		semanticUri := condenseTrailingDir(u.Path, config.CondenseTrailingDirs)
		dir := path.Dir(semanticUri)
		histoDir[dir]++
	}
	sr := sortmap.SortMapByCount(histoDir)
	_ = sr // the sorted counts are not used further here

	// Create dirs
	for k := range histoDir {
		dir := path.Join(docRoot, k) // config.Host already contained in k
		err := fs.MkdirAll(dir, 0755)
		lg(err)
		err = fs.Chtimes(dir, time.Now(), time.Now())
		lg(err)
	}

	// Save the articles as files
	for _, a := range fullArticles {
		if len(a.Body) == 0 {
			continue
		}
		u, err := url.Parse(a.Url)
		lg(err)
		if err != nil {
			continue
		}
		u.Fragment = ""
		u.RawQuery = ""
		semanticUri := condenseTrailingDir(u.RequestURI(), config.CondenseTrailingDirs)
		p := path.Join(docRoot, semanticUri)
		err = fs.WriteFile(p, a.Body, 0644)
		lg(err)
		err = fs.Chtimes(p, a.Mod, a.Mod)
		lg(err)
	}

	// persist the per-directory counts alongside the fetched files
	{
		b, err := json.MarshalIndent(histoDir, "  ", "\t")
		lg(err)
		fnDigest := path.Join(docRoot, config.Host, "fetchDigest.json")
		err = fs.WriteFile(fnDigest, b, 0755)
		lg(err)
	}

	// fsm, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsm.Dump()
	// }

}
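
// exampleFetchHandler is an illustrative sketch, not part of the original
// package: it shows one way FetchUsingRSS might be wired into an HTTP handler.
// The FetchCommand values below are made-up placeholders; a real caller would
// supply its own filesystem implementation and configuration.
func exampleFetchHandler(fs fsi.FileSystem) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		cmd := FetchCommand{
			Host:          "test.economist.com", // hypothetical host; compare switchTData above
			SearchPrefix:  "/news",              // hypothetical starting directory
			DesiredNumber: 5,                    // fetch at most five articles
		}
		FetchUsingRSS(w, r, fs, cmd)
	}
}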

// stuffStage1 ranges over the RSS entries and filters out unwanted directories.
// Wanted URLs are sent into the stage-1 channel inn.
func stuffStage1(w http.ResponseWriter, r *http.Request, config FetchCommand,
	inn chan *FullArticle, fin chan struct{}, dirTree *DirTree,
	uriPrefixExcl, uriPrefixIncl string, nWant int) (nFound int) {

	lg, lge := loghttp.Logger(w, r)
	_ = lge

	subtree, head := DiveToDeepestMatch(dirTree, uriPrefixIncl)

	if subtree == nil {
		lg("      does not exist in dirtree: %q", uriPrefixIncl)
	} else {

		opt := LevelWiseDeeperOptions{}
		opt.Rump = head
		opt.ExcludeDir = uriPrefixExcl
		opt.MaxDepthDiff = config.DepthTolerance
		opt.CondenseTrailingDirs = config.CondenseTrailingDirs
		opt.MaxNumber = nWant
		articles := LevelWiseDeeper(w, r, subtree, opt)
		// lg("      levelwise deeper found %v articles", len(articles))

		for _, art := range articles {

			art := art // copy; &art below must not point at the shared loop variable

			lg("    feed #%02v: %v - %v", nFound, art.Mod.Format("15:04:05"), stringspb.Ellipsoider(art.Url, 50))

			art.Url = config.Host + art.Url

			select {
			case inn <- &art:
				// stage 1 loading
			case <-fin:
				lg("downstream stage has shut down, stop stuffing stage1")
				return
			}

			nFound++
			if nFound >= nWant {
				return
			}

		}

	}

	return

}
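
// mergeUntilClosed is a small, self-contained sketch, not used by the pipeline
// above, of the nil-channel technique the stage-2 and stage-3 comments refer
// to: receiving from a nil channel blocks forever, so setting an exhausted
// channel to nil disables its select case while the other cases keep working.
// Stage 3 applies the same idea when it nils out after its timeout fires.
func mergeUntilClosed(a, b chan int) []int {
	var merged []int
	for a != nil || b != nil {
		select {
		case v, ok := <-a:
			if !ok {
				a = nil // a is drained; disable this case
				continue
			}
			merged = append(merged, v)
		case v, ok := <-b:
			if !ok {
				b = nil // b is drained; disable this case
				continue
			}
			merged = append(merged, v)
		}
	}
	return merged
}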