github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/wpjsontool/cache.go (about)

     1  package main
     2  
     3  import (
     4  	"github.com/bcampbell/warc"
     5  	"github.com/flytam/filenamify"
     6  	"net/http"
     7  	"os"
     8  	"path/filepath"
     9  	"time"
    10  )
    11  
    12  // HTTPGetWithCache performs a GET, using files in cacheDir to cache requests.
    13  // If cacheDir is "", don't bother caching.
    14  func HTTPGetWithCache(client *http.Client, u string, cacheDir string) (*http.Response, error) {
    15  
    16  	// passthru if we're not using a cache at all
    17  	if cacheDir == "" {
    18  		return client.Get(u)
    19  	}
    20  	err := os.MkdirAll(cacheDir, os.ModePerm)
    21  	if err != nil {
    22  		return nil, err
    23  	}
    24  
    25  	// note: filenamify default length is 100 which is waaaaay too short for us.
    26  	safeName, err := filenamify.Filenamify(u, filenamify.Options{MaxLength: 250})
    27  	cacheName := filepath.Join(cacheDir, safeName)
    28  
    29  	resp, err := warc.ReadFile(cacheName)
    30  	if err != nil {
    31  		if os.IsNotExist(err) {
    32  			// not in cache - perform a real http request
    33  			resp, err = client.Get(u)
    34  			if err != nil {
    35  				return nil, err
    36  			}
    37  			cache := false
    38  			// Cache 2xx, 3xx and 4xx responses
    39  			if resp.StatusCode >= 200 && resp.StatusCode < 300 {
    40  				cache = true
    41  			}
    42  			if resp.StatusCode >= 300 && resp.StatusCode < 400 {
    43  				cache = true
    44  			}
    45  			if resp.StatusCode >= 400 && resp.StatusCode < 500 {
    46  				cache = true
    47  			}
    48  			if cache {
    49  				// success. write to cache.
    50  				out, err := os.Create(cacheName)
    51  				if err != nil {
    52  					return nil, err
    53  				}
    54  				err = warc.Write(out, resp, u, time.Now())
    55  				if err != nil {
    56  					return nil, err
    57  				}
    58  			}
    59  		}
    60  	}
    61  	return resp, nil
    62  }