github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/repo/4_fetch.go

package repo

import (
	"bytes"
	"encoding/json"
	"net/http"
	"net/url"
	"path"
	"sync"
	"time"

	"github.com/pbberlin/tools/net/http/fetch"
	"github.com/pbberlin/tools/net/http/loghttp"
	"github.com/pbberlin/tools/os/fsi"
	"github.com/pbberlin/tools/sort/sortmap"
	"github.com/pbberlin/tools/stringspb"
)

// FetchUsingRSS takes an RSS XML URI and fetches some of its documents.
// It uses a three-stage pipeline for parallel fetching.
// Results are stored into the given filesystem fs.
// Config points to the source of the RSS XML
// and has some rules for conflating URI directories.
// config.SearchPrefix and config.DesiredNumber tell the func
// which subdirs of the RSS dir should be fetched - and how many at most.
func FetchUsingRSS(w http.ResponseWriter, r *http.Request,
	fs fsi.FileSystem, config FetchCommand,
) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored

	if config.Host == "" {
		lg(" empty host; returning")
		return
	}

	config = addDefaults(config)

	// Fetching the RSS XML takes time.
	// We do it before the timeouts of the pipeline stages are set off.
	lg(" ")
	lg(config.Host)
	if config.Host == "test.economist.com" {
		switchTData(w, r)
	}

	// lg(stringspb.IndentedDump(config))
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

	fnDigest := path.Join(docRoot, config.Host, "digest2.json")
	loadDigest(w, r, lg, fs, fnDigest, dirTree) // load previous digest

	age := time.Since(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))
	if age.Hours() > 0.001 {

		rssUrl := matchingRSSURI(w, r, config)
		if rssUrl == "" {
			m := new(MyWorker)
			m.r = r
			m.lg = lg
			m.fs1 = fs
			m.SURL = path.Join(config.Host, config.SearchPrefix)
			_, _, _, err := fetchSave(m)
			lg(err)
			if err != nil {
				return
			}
		} else {
			rssUrl = path.Join(config.Host, rssUrl)
			rssDoc, rssUrlObj := rssXMLFile(w, r, fs, rssUrl)
			_ = rssUrlObj
			rssDoc2DirTree(w, r, dirTree, rssDoc, config.Host)
		}

		saveDigest(lg, fs, fnDigest, dirTree)
	}

	// lg(dirTree.String())

	//
	// setting up a three-stage pipeline from bottom up
	//
	var fullArticles []FullArticle

	inn := make(chan *FullArticle) // jobs are stuffed in here
	out := make(chan *FullArticle) // completed jobs are delivered here
	fin := make(chan struct{})     // downstream signals end to upstream
	var stage3Wait sync.WaitGroup

	// stage 3
	// fire up the "collector", a fan-in
	stage3Wait.Add(1) // register before launching, so Wait() below cannot pass early
	go func() {
		// 400 is a good value; critical point at 35
		// economist.com required 800 ms
		const delayInitial = 1200
		const delayRefresh = 800
		cout := time.After(time.Millisecond * delayInitial)
		for {
			select {

			case fa := <-out:
				fullArticles = append(fullArticles, *fa)
				pth := fetch.PathFromStringUrl(fa.Url)
				lg("  fetched %v - %v ", fa.Mod.Format("15:04:05"), stringspb.Ellipsoider(pth, 50))
				cout = time.After(time.Millisecond * delayRefresh) // refresh timeout
			case <-cout:
				lg("timeout after %v articles", len(fullArticles))
				// we are using the channel == nil / channel closed combinations
				// inspired by http://dave.cheney.net/2013/04/30/curious-channels
				out = nil // not close(out) => the case above is now blocked
				close(fin)
				lg("fin closed; out nilled")
				stage3Wait.Done()
				return
			}
		}
	}()
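
	// For reference, a minimal sketch of the nil-channel / closed-channel idiom
	// that stages 2 and 3 rely on (illustrative only, not part of the pipeline):
	// receiving from a nil channel blocks forever, which disables that select
	// case, while receiving from a closed channel always proceeds immediately,
	// which makes close() a broadcast signal to all readers.
	//
	//	var data chan int           // nil: the <-data case below can never fire
	//	done := make(chan struct{})
	//	close(done)                 // every <-done now proceeds immediately
	//	select {
	//	case <-data:
	//		// unreachable while data == nil
	//	case <-done:
	//		// always taken: a closed channel is always ready to receive from
	//	}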

	//
	// stage 2
	for i := 0; i < numWorkers; i++ {
		// fire up a dedicated fetcher routine, a worker
		// we are using the channel == nil / channel closed combinations
		// inspired by http://dave.cheney.net/2013/04/30/curious-channels
		go func() {
			var a *FullArticle
			for {
				select {
				case a = <-inn:
					var err error
					var inf fetch.Info
					a.Body, inf, err = fetch.UrlGetter(r, fetch.Options{URL: a.Url})
					lg(err)
					if a.Mod.IsZero() {
						a.Mod = inf.Mod
					}
					select {
					case out <- a:
					case <-fin:
						lg("   worker spinning down; branch 1; abandoning %v", a.Url)
						return
					}
					a = new(FullArticle)
				case <-fin:
					if a != nil && a.Url != "" {
						u, _ := url.Parse(a.Url)
						lg("   abandoned %v", u.Path)
					} else {
						lg("   worker spinning down; branch 2")
					}
					return
				}
			}
		}()
	}

	//
	// loading stage 1
	uriPrefix := config.SearchPrefix
	found := 0
	uriPrefixExcl := "impossible"
	for i := 0; i < 15; i++ {
		lg("  searching for prefix %v - excl %q - %v of %v", uriPrefix, uriPrefixExcl, found, config.DesiredNumber)
		found += stuffStage1(w, r, config, inn, fin, dirTree,
			uriPrefixExcl, uriPrefix, config.DesiredNumber-found)

		if found >= config.DesiredNumber {
			break
		}

		if uriPrefix == "/" || uriPrefix == "." {
			lg("  root exhausted")
			break
		}

		newPrefix := path.Dir(uriPrefix)
		uriPrefixExcl = uriPrefix
		uriPrefix = newPrefix
	}
	lg("  found %v of %v", found, config.DesiredNumber)
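
	// To illustrate the walk above (the prefix "/news/europe" is hypothetical):
	// round 1 searches "/news/europe", round 2 searches "/news" while excluding
	// the already-covered "/news/europe", round 3 searches "/" while excluding
	// "/news" - because path.Dir strips one trailing path element per round.
	// The loop stops once DesiredNumber is reached or the root is exhausted.
	//
	//	path.Dir("/news/europe") == "/news"
	//	path.Dir("/news")        == "/"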

	//
	lg("stage3Wait.Wait() before")
	stage3Wait.Wait()
	lg("stage3Wait.Wait() after")

	// workers spin down earlier -
	// but the ae log writer and the response writer need some time
	// to record the spin-down messages
	time.Sleep(120 * time.Millisecond)

	// compile directory statistics
	histoDir := map[string]int{}
	for _, a := range fullArticles {
		u, err := url.Parse(a.Url)
		lg(err)
		if err != nil {
			continue
		}
		semanticUri := condenseTrailingDir(u.Path, config.CondenseTrailingDirs)
		dir := path.Dir(semanticUri)
		histoDir[dir]++
	}
	sr := sortmap.SortMapByCount(histoDir)
	_ = sr

	// create dirs
	for k := range histoDir {
		dir := path.Join(docRoot, k) // config.Host is already contained in k
		err := fs.MkdirAll(dir, 0755)
		lg(err)
		err = fs.Chtimes(dir, time.Now(), time.Now())
		lg(err)
	}

	// save as files
	for _, a := range fullArticles {
		if len(a.Body) == 0 {
			continue
		}
		u, err := url.Parse(a.Url)
		lg(err)
		if err != nil {
			continue
		}
		u.Fragment = ""
		u.RawQuery = ""
		semanticUri := condenseTrailingDir(u.RequestURI(), config.CondenseTrailingDirs)
		p := path.Join(docRoot, semanticUri)
		err = fs.WriteFile(p, a.Body, 0644)
		lg(err)
		err = fs.Chtimes(p, a.Mod, a.Mod)
		lg(err)
	}

	{
		b, err := json.MarshalIndent(histoDir, " ", "\t")
		lg(err)
		fnDigest := path.Join(docRoot, config.Host, "fetchDigest.json")
		err = fs.WriteFile(fnDigest, b, 0755)
		lg(err)
	}

	// fsm, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsm.Dump()
	// }

}

// stuffStage1 ranges over the RSS entries and filters out unwanted directories.
// Wanted URLs are sent to the stage-one channel inn.
func stuffStage1(w http.ResponseWriter, r *http.Request, config FetchCommand,
	inn chan *FullArticle, fin chan struct{}, dirTree *DirTree,
	uriPrefixExcl, uriPrefixIncl string, nWant int) (nFound int) {

	lg, lge := loghttp.Logger(w, r)
	_ = lge

	subtree, head := DiveToDeepestMatch(dirTree, uriPrefixIncl)

	if subtree == nil {
		lg("   does not exist in dirtree: %q", uriPrefixIncl)
	} else {

		opt := LevelWiseDeeperOptions{}
		opt.Rump = head
		opt.ExcludeDir = uriPrefixExcl
		opt.MaxDepthDiff = config.DepthTolerance
		opt.CondenseTrailingDirs = config.CondenseTrailingDirs
		opt.MaxNumber = nWant
		articles := LevelWiseDeeper(w, r, subtree, opt)
		// lg("   levelwise deeper found %v articles", len(articles))

		for _, art := range articles {

			art := art // per-iteration copy; otherwise every send below would alias the same loop variable

			lg("    feed #%02v: %v - %v", nFound, art.Mod.Format("15:04:05"), stringspb.Ellipsoider(art.Url, 50))

			art.Url = config.Host + art.Url

			select {
			case inn <- &art:
				// stage 1 loading
			case <-fin:
				lg("downstream stage has shut down, stop stuffing stage 1")
				return
			}

			nFound++
			if nFound >= nWant {
				return
			}

		}

	}

	return
}
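
// For orientation, a minimal calling sketch. Assumptions: the memfs constructor
// name is a guess (only memfs.Unwrap appears above), and the example host and
// prefix are placeholders; Host, SearchPrefix and DesiredNumber are the
// FetchCommand fields actually used in this file.
//
//	func handleFetch(w http.ResponseWriter, r *http.Request) {
//		fs := memfs.New() // any fsi.FileSystem implementation works
//		cmd := FetchCommand{
//			Host:          "www.economist.com",
//			SearchPrefix:  "/news",
//			DesiredNumber: 5,
//		}
//		FetchUsingRSS(w, r, fs, cmd)
//	}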