github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/repo/6_dir_digest_3.go

package repo

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"path"
	"time"

	"github.com/pbberlin/tools/appengine/util_appengine"
	"github.com/pbberlin/tools/net/http/fetch"
	"github.com/pbberlin/tools/net/http/loghttp"
	"golang.org/x/net/context"
	"golang.org/x/net/html"
)

// fetchSave fetches the URL if the local file is outdated and saves the
// fetched file.
//
// Link extraction and link addition to treeX are now accumulated one level higher.
//
// The bool return value is true when the existing local file was used.
func fetchSave(m *MyWorker) ([]byte, time.Time, bool, error) {

	// w http.ResponseWriter,
	// r *http.Request,

	// Determine the file name.
	ourl, err := fetch.URLFromString(m.SURL)
	if err != nil {
		// Guard against an unparsable URL; ourl would be unusable below.
		return nil, time.Time{}, false, err
	}
	fc := FetchCommand{}
	fc.Host = ourl.Host
	fc = addDefaults(fc)
	semanticUri := condenseTrailingDir(m.SURL, fc.CondenseTrailingDirs)
	fn := path.Join(docRoot, semanticUri)

	m.lg("crawling %q", m.SURL)

	// Does the file already exist?
	// Open it for an age check.
	var bts []byte
	var mod time.Time
	f := func() error {
		file1, err := m.fs1.Open(fn)
		// m.lg(err) // file may simply not exist
		if err != nil {
			return err // file may simply not exist
		}
		defer file1.Close() // close the file *fast*, at the end of *this* anonymous func

		fi, err := file1.Stat()
		m.lg(err)
		if err != nil {
			return err
		}

		if fi.IsDir() {
			m.lg("\t\t file is a directory, skipping - %v", fn)
			return fmt.Errorf("is directory: %v", fn)
		}

		mod = fi.ModTime()
		age := time.Since(mod)
		if age.Hours() > 10 {
			m.lg("\t\t file %4.2v hours old, refetch ", age.Hours())
			return fmt.Errorf("too old: %v", fn)
		}

		m.lg("\t\t file only %4.2v hours old, take %4.2vkB from datastore", age.Hours(), fi.Size()/1024)
		bts, err = ioutil.ReadAll(file1)
		if err != nil {
			return err
		}
		return nil
	}

	err = f()
	if err == nil {
		return bts, mod, true, err
	}

	//
	// Fetch
	bts, inf, err := fetch.UrlGetter(m.r, fetch.Options{URL: m.SURL, KnownProtocol: m.Protocol, RedirectHandling: 1})
	m.lg(err)
	if err != nil {
		if inf.Status != http.StatusNotFound {
			m.lg("tried to fetch %v, %v", m.SURL, inf.URL)
			m.lg("msg %v", inf.Msg)
			return []byte{}, inf.Mod, false, err
		}
		// While traversing upwards, we might encounter "directory links" that have no index.html.
		// For a *derived* URL, this is not an error.
		bts = []byte(" ... not found ... ")
	}
	if inf.Mod.IsZero() {
		inf.Mod = time.Now().Add(-75 * time.Minute)
	}

	//
	//
	// main request still exists?
	if false {
		var cx context.Context
		cx = util_appengine.SafelyExtractGaeContext(m.r)
		if cx == nil {
			m.lg("timed out - returning")
			return bts, inf.Mod, false, fmt.Errorf("req timed out")
		}
	}

	m.lg("retrieved+saved %q; %vkB ", inf.URL.Host+inf.URL.Path, len(bts)/1024)

	if len(bts) > 1024*1024-1 {
		bts = removeScriptsAndComments(m.lg, bts)
		m.lg("size reduced_1 to %vkB ", len(bts)/1024)

		// if len(bts) > 1024*1024-1 {
		// 	bts = snappy.Encode(nil, bts)
		// 	fn = strings.Replace(fn, ".html", ".snap.html", -1)
		// 	m.lg("size reduced_2 to %vkB ", len(bts)/1024)
		// }
	}

	//
	//
	dir := path.Dir(fn)
	err = m.fs1.MkdirAll(dir, 0755)
	m.lg(err)
	err = m.fs1.Chtimes(dir, time.Now(), time.Now())
	m.lg(err)
	err = m.fs1.WriteFile(fn, bts, 0644)
	m.lg(err)
	err = m.fs1.Chtimes(fn, inf.Mod, inf.Mod)
	m.lg(err)

	return bts, inf.Mod, false, nil
}

// removeScriptsAndComments parses the HTML, strips <script> elements and
// comment nodes, and re-renders the document; it is used to shrink oversized pages.
func removeScriptsAndComments(lg loghttp.FuncBufUniv, bts []byte) []byte {
	doc, err := html.Parse(bytes.NewReader(bts))
	lg(err)
	if err != nil {
		return []byte{}
	}
	var fr func(*html.Node) // recursive traversal func
	fr = func(n *html.Node) {
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fr(c)
		}
		removeUnwanted(n)
	}
	fr(doc)
	var b bytes.Buffer
	err = html.Render(&b, doc)
	lg(err)
	return b.Bytes()
}

// removeUnwanted drops <script> elements and comment nodes among the direct children of n.
func removeUnwanted(n *html.Node) {
	// Collect the children first, since removing during iteration would break NextSibling.
	cc := []*html.Node{}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		cc = append(cc, c)
	}
	for _, c := range cc {
		if (c.Type == html.ElementNode && c.Data == "script") || c.Type == html.CommentNode {
			n.RemoveChild(c)
		}
	}
}
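
// exampleStripUsage is an illustrative sketch, not part of the original
// package: it demonstrates the cleanup that removeScriptsAndComments performs,
// on a tiny, made-up HTML fragment, using only removeUnwanted and a local walk
// so that no loghttp logger is needed. The function name and the input markup
// are hypothetical.
func exampleStripUsage() string {
	raw := []byte(`<html><head><script>track()</script></head><body><!-- note -->hello</body></html>`)
	doc, err := html.Parse(bytes.NewReader(raw))
	if err != nil {
		return ""
	}
	// Post-order walk, same shape as fr above: clean the children of every node.
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
		removeUnwanted(n)
	}
	walk(doc)
	var b bytes.Buffer
	if err := html.Render(&b, doc); err != nil {
		return ""
	}
	// The <script> element and the comment are gone; the remaining markup is preserved.
	return b.String()
}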