github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/wpjsontool/cache.go (about) 1 package main 2 3 import ( 4 "github.com/bcampbell/warc" 5 "github.com/flytam/filenamify" 6 "net/http" 7 "os" 8 "path/filepath" 9 "time" 10 ) 11 12 // HTTPGetWithCache performs a GET, using files in cacheDir to cache requests. 13 // If cacheDir is "", don't bother caching. 14 func HTTPGetWithCache(client *http.Client, u string, cacheDir string) (*http.Response, error) { 15 16 // passthru if we're not using a cache at all 17 if cacheDir == "" { 18 return client.Get(u) 19 } 20 err := os.MkdirAll(cacheDir, os.ModePerm) 21 if err != nil { 22 return nil, err 23 } 24 25 // note: filenamify default length is 100 which is waaaaay too short for us. 26 safeName, err := filenamify.Filenamify(u, filenamify.Options{MaxLength: 250}) 27 cacheName := filepath.Join(cacheDir, safeName) 28 29 resp, err := warc.ReadFile(cacheName) 30 if err != nil { 31 if os.IsNotExist(err) { 32 // not in cache - perform a real http request 33 resp, err = client.Get(u) 34 if err != nil { 35 return nil, err 36 } 37 cache := false 38 // Cache 2xx, 3xx and 4xx responses 39 if resp.StatusCode >= 200 && resp.StatusCode < 300 { 40 cache = true 41 } 42 if resp.StatusCode >= 300 && resp.StatusCode < 400 { 43 cache = true 44 } 45 if resp.StatusCode >= 400 && resp.StatusCode < 500 { 46 cache = true 47 } 48 if cache { 49 // success. write to cache. 50 out, err := os.Create(cacheName) 51 if err != nil { 52 return nil, err 53 } 54 err = warc.Write(out, resp, u, time.Now()) 55 if err != nil { 56 return nil, err 57 } 58 } 59 } 60 } 61 return resp, nil 62 }