github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/repo/8_fetch_similar.go (about)

package repo

import (
	"bytes"
	"encoding/json"
	"io/ioutil"
	"net/http"
	"path"
	"strconv"
	"strings"
	"time"

	"github.com/pbberlin/tools/distrib"
	"github.com/pbberlin/tools/net/http/fetch"
	"github.com/pbberlin/tools/net/http/loghttp"
	"github.com/pbberlin/tools/net/http/routes"
	"github.com/pbberlin/tools/net/http/tplx"
	"github.com/pbberlin/tools/os/fsi"
	"github.com/pbberlin/tools/stringspb"
	"google.golang.org/appengine"
)

// MyWorker bundles the state for one concurrent fetch job.
type MyWorker struct {
	SURL     string // host + path of the document to fetch
	Protocol string // "http" or "https", if already known

	r *http.Request

	lg loghttp.FuncBufUniv

	fs1 fsi.FileSystem

	err error
	FA  *FullArticle
}

// Work fetches m.SURL and stores the result in m.FA;
// errors are kept in m.err for inspection by the caller.
func (m *MyWorker) Work() {

	bts, mod, _, err := fetchSave(m)

	if err != nil {
		m.err = err
		return
	}

	// Discard tiny bodies that are only a "no redirects" notice.
	if len(bts) < 200 && bytes.Contains(bts, []byte(fetch.MsgNoRdirects)) {
		return
	}

	m.FA = &FullArticle{}
	m.FA.Mod = mod
	m.FA.Body = bts
	m.FA.Url = m.SURL

}
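// Compile-time interface check (an addition for documentation; not in the
// original file): distrib.Distrib below consumes these workers through the
// distrib.Worker interface, which *MyWorker satisfies via its Work method.
var _ distrib.Worker = (*MyWorker)(nil)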
// FetchSimilar is an extended version of Fetch.
// It uses a DirTree of crawled *links*, not actual files.
// As it moves up the DOM, it crawls every document for additional links.
// It first moves up to find similar URLs at the same depth:
//    /\
//   /\ / \
//  /\ / \ / \
// It then moves up the ladder again - to accept higher URLs:
//    /\
//   /\
//  /\
func FetchSimilar(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored; the closure reads b when the defer runs

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	start := time.Now()

	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Find similar HTML URLs"}))
	defer wpf(b, tplx.Foot)

	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	fs1 := GetFS(appengine.NewContext(r))

	err := r.ParseForm()
	lg(err)

	countSimilar := 3
	sCountSimilar := r.FormValue("cnt")
	if sCountSimilar != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sCountSimilar))
		if err == nil {
			countSimilar = i
		}
	}

	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := r.FormValue("prot") // empty if not given

	numWorkers := 0
	sNumWorkers := r.FormValue("numworkers")
	if sNumWorkers != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sNumWorkers))
		if err == nil {
			numWorkers = i
		}
	}

	srcDepth := strings.Count(ourl.Path, "/")

	cmd := FetchCommand{}
	cmd.Host = ourl.Host
	cmd.SearchPrefix = ourl.Path
	cmd = addDefaults(cmd)

	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}
	fnDigest := path.Join(docRoot, cmd.Host, "digest2.json")
	loadDigest(w, r, lg, fs1, fnDigest, dirTree) // load previous digest
	lg("dirtree (first 400 chars): %v", stringspb.ToLen(dirTree.String(), 400))

	// Fetch the source document itself.
	m1 := new(MyWorker)
	m1.r = r
	m1.lg = lg
	m1.fs1 = fs1
	m1.SURL = path.Join(cmd.Host, ourl.Path)
	m1.Protocol = knownProtocol
	btsSrc, modSrc, usedExisting, err := fetchSave(m1)
	if !usedExisting {
		addAnchors(lg, cmd.Host, btsSrc, dirTree)
	}
	lg(err)
	if err != nil {
		return
	}

	lg("\t\t%4.2v secs so far 1", time.Since(start).Seconds())

	// Leftover development values, kept for reference:
	// treePath = "/blogs/freeexchange"
	// treePath = "/news/europe"
	treePath := path.Dir(ourl.Path)

	opt := LevelWiseDeeperOptions{}
	opt.Rump = treePath
	// Earlier trial values, kept for reference:
	// opt.ExcludeDir = "/news/americas"
	// opt.ExcludeDir = "/blogs/buttonwood"
	opt.ExcludeDir = "/something-impossible"
	opt.MinDepthDiff = 1
	opt.MaxDepthDiff = 1
	opt.CondenseTrailingDirs = cmd.CondenseTrailingDirs
	// opt.MaxNumber = cmd.DesiredNumber + 1 // one more for "self"
	opt.MaxNumber = cmd.DesiredNumber + 40 // collect more, because too-old documents are filtered out later

	var subtree *DirTree
	links := []FullArticle{}

	alreadyCrawled := map[string]struct{}{}

MarkOuter:
	// j raises the level on which URLs are accepted (0 = same depth as the source).
	for j := 0; j < srcDepth; j++ {
		treePath = path.Dir(ourl.Path)
	MarkInner:
		// for i := 1; i < srcDepth; i++ {
		// i climbs the tree towards the root, a few levels beyond srcDepth.
		for i := 1; i < (srcDepth + 5); i++ {

			subtree, treePath = DiveToDeepestMatch(dirTree, treePath)

			lg("Looking from height %v to level %v - %v", srcDepth-i, srcDepth-j, treePath)

			if _, ok := alreadyCrawled[treePath]; ok {
				// lg("\t already digested %v", treePath)
				continue
			}

			m2 := new(MyWorker)
			m2.r = r
			m2.lg = lg
			m2.fs1 = fs1
			m2.SURL = path.Join(cmd.Host, treePath)
			m2.Protocol = knownProtocol

			btsPar, _, usedExisting, err := fetchSave(m2)
			lg(err)
			if err != nil {
				return
			}
			alreadyCrawled[treePath] = struct{}{}
			if !usedExisting {
				addAnchors(lg, cmd.Host, btsPar, dirTree)
			}

			if subtree == nil {
				lg("\n#%v treePath %q ; subtree is nil", i, treePath)
			} else {
				// lg("\n#%v treePath %q ; subtree exists", i, treePath)

				opt.Rump = treePath
				opt.MinDepthDiff = i - j
				opt.MaxDepthDiff = i - j
				lvlLinks := LevelWiseDeeper(nil, nil, subtree, opt)
				links = append(links, lvlLinks...)
				// for _, art := range lvlLinks {
				// 	lg("#%v fnd %v", i, stringspb.ToLen(art.Url, 100))
				// }

				if len(links) >= opt.MaxNumber {
					lg("found enough links")
					break MarkOuter
				}

				pathPrev := treePath
				treePath = path.Dir(treePath)
				// lg("#%v bef %v - aft %v", i, pathPrev, treePath)

				// Stop climbing once path.Dir no longer shortens the path.
				if (pathPrev == "." && treePath == ".") ||
					(pathPrev == "/" && treePath == "/") ||
					(pathPrev == "" && treePath == ".") {
					lg("break to inner")
					break MarkInner
				}
			}

		}
	}
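	// Worked example (an illustration added here; the concrete path is
	// hypothetical, shaped like the /news/europe examples above):
	// for ourl.Path = "/news/europe/12345", srcDepth is 3.
	//   j=0, i=1: climb to "/news/europe", dive 1 level -> siblings of the source
	//   j=0, i=2: climb to "/news",        dive 2 levels -> e.g. "/news/asia/..."
	//   j=1, i=2: climb to "/news",        dive 1 level -> URLs one level higher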
	lg("%v links after %4.2v secs", len(links), time.Since(start).Seconds())

	lg("============================")
	lg("Now reading/fetching actual similar files - not just the links")

	tried := 0
	selecteds := []FullArticle{}

	nonExisting := []FullArticle{}
	nonExistFetched := []FullArticle{}

	for _, art := range links {

		if art.Url == ourl.Path {
			lg("skipping self\t%v", art.Url)
			continue
		}

		tried++

		useExisting := false

		semanticUri := condenseTrailingDir(art.Url, cmd.CondenseTrailingDirs)
		p := path.Join(docRoot, cmd.Host, semanticUri)

		f, err := fs1.Open(p)
		if err != nil {
			// not an error: the file may simply not exist in the store yet
			// lg("!nstore %q", semanticUri)
		} else {
			// lg("reading %q", semanticUri)

			// Wrap the read in a func so that f.Close is called at the end of
			// this iteration; otherwise defer f.Close() would span the entire
			// handler and prevent overwrites/chmods further down.
			readExisting := func() {
				defer f.Close()
				fi, err := f.Stat()
				lg(err)
				if err == nil {
					age := time.Since(fi.ModTime())
					if age.Hours() < 10 {
						lg("\t\tusing existing file with age %4.2v hrs", age.Hours())
						art.Mod = fi.ModTime()
						bts, err := ioutil.ReadAll(f)
						lg(err)
						art.Body = bts
						// Discard tiny bodies that are only a "no redirects" notice.
						if len(bts) < 200 && bytes.Contains(bts, []byte(fetch.MsgNoRdirects)) {
							return
						}
						selecteds = append(selecteds, art)
						useExisting = true
					}
				}
			}
			readExisting()

		}

		if !useExisting {
			nonExisting = append(nonExisting, art)
		}

		if len(selecteds) >= countSimilar {
			break
		}

	}
	lg("============================")
	lg("tried %v links - yielding %v existing similars; not existing in datastore: %v; %v were requested.",
		tried, len(selecteds), len(nonExisting), countSimilar)
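	// Freshness rule: both paths - the datastore read above and the live
	// fetches below - only accept documents younger than ten hours.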
	if len(selecteds) < countSimilar {
		jobs := make([]distrib.Worker, 0, len(nonExisting))
		for _, art := range nonExisting {
			surl := path.Join(cmd.Host, art.Url)
			wrkr := MyWorker{SURL: surl}
			wrkr.Protocol = knownProtocol
			wrkr.r = r
			wrkr.lg = lg
			wrkr.fs1 = fs1
			job := distrib.Worker(&wrkr)
			jobs = append(jobs, job)
		}

		opt := distrib.NewDefaultOptions()
		opt.TimeOutDur = 3500 * time.Millisecond
		opt.Want = int32(countSimilar - len(selecteds) + 4) // get some more, in case we have "redirected" bodies
		opt.NumWorkers = int(opt.Want)                      // 5s query limit => hurry; spawn as many as we want
		if numWorkers > 0 {
			opt.NumWorkers = numWorkers
		}
		lg("Preparing %v simultaneous, wanting %v fetches; at %4.2v secs.", opt.NumWorkers, opt.Want, time.Since(start).Seconds())
		opt.CollectRemainder = false // 5s query limit => hurry; don't wait for stragglers

		ret, msg := distrib.Distrib(jobs, opt)
		lg("Distrib returned at %4.2v secs with %v results.", time.Since(start).Seconds(), len(ret))

		lg("\n" + msg.String())
		for _, v := range ret {
			v1, _ := v.Worker.(*MyWorker) // all jobs were created as *MyWorker above
			if v1.FA != nil {
				age := time.Since(v1.FA.Mod)
				if age.Hours() < 10 {
					lg("\t\tusing fetched file with age %4.2v hrs", age.Hours())
					nonExistFetched = append(nonExistFetched, *v1.FA)
					if len(nonExistFetched) > (countSimilar - len(selecteds)) {
						break
					}
				}
			}
			if v1.err != nil {
				lg(v1.err)
			}
		}

		lg("tried %v links - yielding %v fetched - jobs %v", len(nonExisting), len(nonExistFetched), len(jobs))
		selecteds = append(selecteds, nonExistFetched...)

		// Extract links from the freshly fetched documents.
		for _, v := range nonExistFetched {
			// lg("links -> memory dirtree for %q", v.Url)
			addAnchors(lg, cmd.Host, v.Body, dirTree)
		}

	}

	// Persist the digest only if the crawl above actually found new links.
	if time.Since(dirTree.LastFound).Seconds() < 10 {
		lg("saving accumulated (new) links to digest")
		saveDigest(lg, fs1, fnDigest, dirTree)
	}

	lg("\t\t%4.2v secs so far 3", time.Since(start).Seconds())

	mp := map[string][]byte{}
	mp["msg"] = b.Bytes()
	mp["url_self"] = []byte(condenseTrailingDir(ourl.Path, cmd.CondenseTrailingDirs))
	mp["mod_self"] = []byte(modSrc.Format(http.TimeFormat))
	mp["bod_self"] = btsSrc

	for i, v := range selecteds {
		mp["url__"+spf("%02v", i)] = []byte(v.Url)
		mp["mod__"+spf("%02v", i)] = []byte(v.Mod.Format(http.TimeFormat))
		mp["bod__"+spf("%02v", i)] = v.Body
	}

	mp["lensimilar"] = []byte(spf("%02v", len(selecteds)))

	smp, err := json.MarshalIndent(mp, "", "\t")
	if err != nil {
		lg("marshalling mp to []byte failed: %v", err)
		return
	}

	r.Header.Set("X-Custom-Header-Counter", "nocounter")
	w.Header().Set("Content-Type", "application/json")
	w.Write(smp)

	b.Reset()             // keeps the buf pointer intact; outgoing defers are still heeded
	b = new(bytes.Buffer) // creates a *new* buf pointer; outgoing defers write into the *old* buf

	lg("\t\t%4.2v secs so far 4 (json resp written as []byte)", time.Since(start).Seconds())

}
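// Usage sketch (an addition, not part of the original file): FetchSimilar
// follows the (w, r, map[string]interface{}) handler shape used throughout
// this package. A plain net/http adapter could look like the hypothetical
// registration below; the route path is an assumption.
//
//	http.HandleFunc("/fetch-similar", func(w http.ResponseWriter, r *http.Request) {
//		FetchSimilar(w, r, map[string]interface{}{})
//	})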