github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/arc/warc.go (about)

     1  package arc
     2  
     3  // helpers to write out raw HTTP requests/responses to noddy .warc files
     4  
     5  import (
     6  	"compress/gzip"
     7  	"crypto/md5"
     8  	"encoding/hex"
     9  	"github.com/bcampbell/warc"
    10  	"net/http"
    11  	"net/url"
    12  	"os"
    13  	"path"
    14  	"time"
    15  )
    16  
    17  // eg "abcdefg.foo" returns "a/ab/acb"
    18  func spreadPath(name string) string {
    19  	numChunks := 3 // how many subdirs to use
    20  	chunkSize := 1 // num chars per subdir
    21  
    22  	if len(name) < numChunks*chunkSize {
    23  		panic("name too short")
    24  	}
    25  
    26  	parts := make([]string, numChunks)
    27  	for chunk := 0; chunk < numChunks; chunk++ {
    28  		parts[chunk] = name[0 : (chunk+1)*chunkSize]
    29  	}
    30  	return path.Join(parts...)
    31  }
    32  
    33  /*
    34  func AlreadyGot(warcDir, srcURL string) bool,error {
    35  	u, err := url.Parse(srcURL)
    36  	if err != nil {
    37  		return err
    38  	}
    39  	hasher := md5.New()
    40  	hasher.Write([]byte(srcURL))
    41  	filename := hex.EncodeToString(hasher.Sum(nil)) + ".warc"
    42  	dir := path.Join(warcDir, u.Host, spreadPath(filename))
    43      full := path.Join(dir, filename)
    44  }
    45  */
    46  
    47  func ArchiveResponse(warcDir string, resp *http.Response, srcURL string, timeStamp time.Time) error {
    48  
    49  	u, err := url.Parse(srcURL)
    50  	if err != nil {
    51  		return err
    52  	}
    53  
    54  	hasher := md5.New()
    55  	hasher.Write([]byte(srcURL))
    56  	filename := hex.EncodeToString(hasher.Sum(nil)) + ".warc.gz"
    57  
    58  	//dir := path.Join(warcDir, u.Host, timeStamp.UTC().Format("2006-01-02"))
    59  
    60  	// .../www.example.com/1/12/123/12345678.warc
    61  	dir := path.Join(warcDir, u.Host, spreadPath(filename))
    62  	err = os.MkdirAll(dir, 0777) // let umask cull the perms down...
    63  	if err != nil {
    64  		return err
    65  	}
    66  
    67  	outfile, err := os.Create(path.Join(dir, filename))
    68  	if err != nil {
    69  		return err
    70  	}
    71  	defer outfile.Close()
    72  
    73  	gzw := gzip.NewWriter(outfile)
    74  	defer gzw.Close()
    75  
    76  	return warc.Write(gzw, resp, srcURL, timeStamp)
    77  }